diff options
author | Ingo Molnar <mingo@elte.hu> | 2009-01-11 02:42:53 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-01-11 02:42:53 +0100 |
commit | 506c10f26c481b7f8ef27c1c79290f68989b2e9e (patch) | |
tree | 03de82e812f00957aa6276dac2fe51c3358e88d7 /fs | |
parent | e1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (diff) | |
parent | c59765042f53a79a7a65585042ff463b69cb248c (diff) | |
download | lwn-506c10f26c481b7f8ef27c1c79290f68989b2e9e.tar.gz lwn-506c10f26c481b7f8ef27c1c79290f68989b2e9e.zip |
Merge commit 'v2.6.29-rc1' into perfcounters/core
Conflicts:
include/linux/kernel_stat.h
Diffstat (limited to 'fs')
526 files changed, 77358 insertions, 22493 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 522469a7eca3..51307b0fdf0f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -189,6 +189,8 @@ config OCFS2_FS select CONFIGFS_FS select JBD2 select CRC32 + select QUOTA + select QUOTA_TREE help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -258,56 +260,37 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. -config OCFS2_COMPAT_JBD - bool "Use JBD for compatibility" +config OCFS2_FS_POSIX_ACL + bool "OCFS2 POSIX Access Control Lists" depends on OCFS2_FS + select FS_POSIX_ACL default n - select JBD help - The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 - is backwards compatible with JBD. It is safe to say N here. - However, if you really want to use the original JBD, say Y here. - -endif # BLOCK + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. -config DNOTIFY - bool "Dnotify support" - default y +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE help - Dnotify is a directory-based per-fd file change notification system - that uses signals to communicate events to user-space. There exist - superior alternatives, but some applications may still rely on - dnotify. - - If unsure, say Y. - -config INOTIFY - bool "Inotify file change notification support" - default y - ---help--- - Say Y here to enable inotify support. Inotify is a file change - notification system and a replacement for dnotify. Inotify fixes - numerous shortcomings in dnotify and introduces several new features - including multiple file events, one-shot support, and unmount - notification. + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. - For more information, see <file:Documentation/filesystems/inotify.txt> + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. - If unsure, say Y. + To compile this file system support as a module, choose M here. The + module will be called btrfs. -config INOTIFY_USER - bool "Inotify support for userspace" - depends on INOTIFY - default y - ---help--- - Say Y here to enable inotify support for userspace, including the - associated system calls. Inotify allows monitoring of both files and - directories via a single open fd. Events are read from the file - descriptor, which is also select()- and poll()-able. + If unsure, say N. - For more information, see <file:Documentation/filesystems/inotify.txt> +endif # BLOCK - If unsure, say Y. +source "fs/notify/Kconfig" config QUOTA bool "Quota support" @@ -340,6 +323,10 @@ config PRINT_QUOTA_WARNING Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +# Generic support for tree structured quota files. Seleted when needed. +config QUOTA_TREE + tristate + config QFMT_V1 tristate "Old quota format support" depends on QUOTA @@ -351,6 +338,7 @@ config QFMT_V1 config QFMT_V2 tristate "Quota format v2 support" depends on QUOTA + select QUOTA_TREE help This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. @@ -752,7 +740,20 @@ config CONFIGFS_FS endmenu -menu "Miscellaneous filesystems" +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS config ADFS_FS tristate "ADFS file system support (EXPERIMENTAL)" @@ -931,6 +932,58 @@ config CRAMFS If unsure, say N. +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib compression to compress both + files, inodes and directories. Inodes in the system are very small + and all blocks are packed to minimise data overhead. Block sizes + greater than 4K are supported up to a maximum of 1 Mbytes (default + block size 128K). SquashFS 4.0 supports 64 bit filesystems and files + (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read <file:Documentation/modules.txt>. The module + will be called squashfs. Note that the root file system (the one + containing the directory /) cannot be compiled as a module. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + default n + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. + config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK @@ -1122,7 +1175,7 @@ config UFS_DEBUG Y here. This will result in _many_ additional debugging messages to be written to the system log. -endmenu +endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index ce9fb3fbfae4..bb4cc5b8abc8 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC config CORE_DUMP_DEFAULT_ELF_HEADERS bool "Write ELF core dumps with partial segments" default n - depends on BINFMT_ELF + depends on BINFMT_ELF && ELF_CORE help ELF core dump files describe each memory mapping of the crashed process, and can contain or omit the memory contents of each one. diff --git a/fs/Makefile b/fs/Makefile index d9f8afe6f0c4..38bc735c67ad 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -20,8 +20,7 @@ obj-y += no-block.o endif obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o -obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_user.o +obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o @@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o -obj-$(CONFIG_DNOTIFY) += dnotify.o - obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ @@ -76,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ obj-$(CONFIG_CODA_FS) += coda/ @@ -121,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ diff --git a/fs/affs/file.c b/fs/affs/file.c index 1377b1240b6e..9246cb4aa018 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_CACHE_SHIFT; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 415d9c67ac16..3c4ec7d864c4 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) goto bad_inode; #else inode->i_mode |= S_IFDIR; - inode->i_op = NULL; - inode->i_fop = NULL; + /* ... and leave ->i_op and ->i_fop pointing to empty */ break; #endif case ST_LINKFILE: diff --git a/fs/afs/write.c b/fs/afs/write.c index d6b85dab35fc..3fb36d433621 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { kfree(candidate); return -ENOMEM; @@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ } while(0) +static void ctx_rcu_free(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + unsigned nr_events = ctx->max_reqs; + + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} /* __put_ioctx * Called when the last user of an aio context has gone away, @@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx) */ static void __put_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); cancel_delayed_work(&ctx->wq); @@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx) mmdrop(ctx->mm); ctx->mm = NULL; pr_debug("__put_ioctx: freeing %p\n", ctx); - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } + call_rcu(&ctx->rcu_head, ctx_rcu_free); } #define get_ioctx(kioctx) do { \ @@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) { struct mm_struct *mm; struct kioctx *ctx; + int did_sync = 0; /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) goto out_freectx; /* limit the number of system wide aios */ - spin_lock(&aio_nr_lock); - if (aio_nr + ctx->max_reqs > aio_max_nr || - aio_nr + ctx->max_reqs < aio_nr) - ctx->max_reqs = 0; - else - aio_nr += ctx->max_reqs; - spin_unlock(&aio_nr_lock); + do { + spin_lock_bh(&aio_nr_lock); + if (aio_nr + nr_events > aio_max_nr || + aio_nr + nr_events < aio_nr) + ctx->max_reqs = 0; + else + aio_nr += ctx->max_reqs; + spin_unlock_bh(&aio_nr_lock); + if (ctx->max_reqs || did_sync) + break; + + /* wait for rcu callbacks to have completed before giving up */ + synchronize_rcu(); + did_sync = 1; + ctx->max_reqs = nr_events; + } while (1); + if (ctx->max_reqs == 0) goto out_cleanup; /* now link into global list. */ - write_lock(&mm->ioctx_list_lock); - ctx->next = mm->ioctx_list; - mm->ioctx_list = ctx; - write_unlock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); + hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); + spin_unlock(&mm->ioctx_lock); dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, current->mm, ctx->ring_info.nr); @@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) */ void exit_aio(struct mm_struct *mm) { - struct kioctx *ctx = mm->ioctx_list; - mm->ioctx_list = NULL; - while (ctx) { - struct kioctx *next = ctx->next; - ctx->next = NULL; + struct kioctx *ctx; + + while (!hlist_empty(&mm->ioctx_list)) { + ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); + hlist_del_rcu(&ctx->list); + aio_cancel_all(ctx); wait_for_all_aios(ctx); @@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm) atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); put_ioctx(ctx); - ctx = next; } } @@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { - struct kioctx *ioctx; - struct mm_struct *mm; + struct mm_struct *mm = current->mm; + struct kioctx *ctx = NULL; + struct hlist_node *n; - mm = current->mm; - read_lock(&mm->ioctx_list_lock); - for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) - if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { - get_ioctx(ioctx); + rcu_read_lock(); + + hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { + if (ctx->user_id == ctx_id && !ctx->dead) { + get_ioctx(ctx); break; } - read_unlock(&mm->ioctx_list_lock); + } - return ioctx; + rcu_read_unlock(); + return ctx; } /* @@ -1215,19 +1232,14 @@ out: static void io_destroy(struct kioctx *ioctx) { struct mm_struct *mm = current->mm; - struct kioctx **tmp; int was_dead; /* delete the entry from the list is someone else hasn't already */ - write_lock(&mm->ioctx_list_lock); + spin_lock(&mm->ioctx_lock); was_dead = ioctx->dead; ioctx->dead = 1; - for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; - tmp = &(*tmp)->next) - ; - if (*tmp) - *tmp = ioctx->next; - write_unlock(&mm->ioctx_list_lock); + hlist_del_rcu(&ioctx->list); + spin_unlock(&mm->ioctx_lock); dprintk("aio_release(%p)\n", ioctx); if (likely(!was_dead)) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index c16d9be1b017..3bbdb9d02376 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, if (IS_ERR(anon_inode_inode)) return -ENODEV; + if (fops->owner && !try_module_get(fops->owner)) + return -ENOENT; + error = get_unused_fd_flags(flags); if (error < 0) - return error; + goto err_module; fd = error; /* @@ -128,6 +131,8 @@ err_dput: dput(dentry); err_put_unused_fd: put_unused_fd(fd); +err_module: + module_put(fops->owner); return error; } EXPORT_SYMBOL_GPL(anon_inode_getfd); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index c773680d5c60..e1734f2d6e26 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_nlink = 2; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_blocks = 0; if (ino == AUTOFS_ROOT_INO) { inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &autofs_root_inode_operations; inode->i_fop = &autofs_root_operations; - inode->i_uid = inode->i_gid = 0; /* Changed in read_super */ goto done; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index e0f16da00e54..a76803108d06 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -25,8 +25,6 @@ #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) #define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) -#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) - #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 63b7c7afe8df..025e105bffea 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) /* * Check sanity of parameter control fields and if a path is present - * check that it has a "/" and is terminated. + * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { @@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = check_name(param->path); + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", - cmd); + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); goto out; } - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = check_name(param->path); if (err) { AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", cmd); @@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = sbi->version; + param->protover.version = sbi->version; return 0; } @@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = sbi->sub_version; + param->protosubver.sub_version = sbi->sub_version; return 0; } @@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp, int err, fd; /* param->path has already been checked */ - if (!param->arg1) + if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; - devid = param->arg1; + devid = param->openmount.devid; err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, { autofs_wqt_t token; - token = (autofs_wqt_t) param->arg1; + token = (autofs_wqt_t) param->ready.token; return autofs4_wait_release(sbi, token, 0); } @@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp, autofs_wqt_t token; int status; - token = (autofs_wqt_t) param->arg1; - status = param->arg2 ? param->arg2 : -ENOENT; + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } @@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, int pipefd; int err = 0; - if (param->arg1 == -1) + if (param->setpipefd.pipefd == -1) return -EINVAL; - pipefd = param->arg1; + pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!sbi->catatonic) { @@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp, { unsigned long timeout; - timeout = param->arg1; - param->arg1 = sbi->exp_timeout / HZ; + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; return 0; } @@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, path = param->path; devid = sbi->sb->s_dev; - param->arg1 = param->arg2 = -1; + param->requester.uid = param->requester.gid = -1; /* Get nameidata of the parent directory */ err = path_lookup(path, LOOKUP_PARENT, &nd); @@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(nd.path.dentry); spin_lock(&sbi->fs_lock); - param->arg1 = ino->uid; - param->arg2 = ino->gid; + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } @@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp, int err = -EAGAIN; int how; - how = param->arg1; + how = param->expire.how; mnt = fp->f_path.mnt; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); else dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); @@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = 0; + param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) - param->arg1 = 1; + param->askumount.may_umount = 1; return 0; } @@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct nameidata nd; const char *path; unsigned int type; + unsigned int devid, magic; int err = -ENOENT; if (param->size <= sizeof(*param)) { @@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, } path = param->path; - type = param->arg1; + type = param->ismountpoint.in.type; - param->arg1 = 0; - param->arg2 = 0; + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (type == AUTOFS_TYPE_ANY) { + if (autofs_type_any(type)) { struct super_block *sb; err = path_lookup(path, LOOKUP_FOLLOW, &nd); @@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; sb = nd.path.dentry->d_sb; - param->arg1 = new_encode_dev(sb->s_dev); + devid = new_encode_dev(sb->s_dev); } else { struct autofs_info *ino; @@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out_release; ino = autofs4_dentry_ino(nd.path.dentry); - param->arg1 = autofs4_get_dev(ino->sbi); + devid = autofs4_get_dev(ino->sbi); } err = 0; if (nd.path.dentry->d_inode && nd.path.mnt->mnt_root == nd.path.dentry) { err = 1; - param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + magic = nd.path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t devid = new_encode_dev(sbi->sb->s_dev); + dev_t dev = autofs4_get_dev(sbi); err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - err = autofs_dev_ioctl_find_super(&nd, devid); + err = autofs_dev_ioctl_find_super(&nd, dev); if (err) goto out_release; - param->arg1 = autofs4_get_dev(sbi); + devid = dev; err = have_submounts(nd.path.dentry); if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { if (follow_down(&nd.path.mnt, &nd.path.dentry)) { struct inode *inode = nd.path.dentry->d_inode; - param->arg2 = inode->i_sb->s_magic; + magic = inode->i_sb->s_magic; } } } + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + out_release: path_put(&nd.path); out: diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4b6fb3f628c0..e3bd50776f9e 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); /* This is an autofs submount, we can't expire it */ - if (sbi->type == AUTOFS_TYPE_INDIRECT) + if (autofs_type_indirect(sbi->type)) goto done; /* @@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7b19802cfef4..716e12b627b2 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); - if (sbi->type & AUTOFS_TYPE_OFFSET) + if (autofs_type_offset(sbi->type)) seq_printf(m, ",offset"); - else if (sbi->type & AUTOFS_TYPE_DIRECT) + else if (autofs_type_direct(sbi->type)) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); @@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *maxproto = option; break; case Opt_indirect: - *type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(type); break; case Opt_direct: - *type = AUTOFS_TYPE_DIRECT; + set_autofs_type_direct(type); break; case Opt_offset: - *type = AUTOFS_TYPE_OFFSET; + set_autofs_type_offset(type); break; default: return 1; @@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? + root_inode->i_op = autofs_type_trigger(sbi->type) ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; @@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, if (sb->s_root) { inode->i_uid = sb->s_root->d_inode->i_uid; inode->i_gid = sb->s_root->d_inode->i_gid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; } - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; if (S_ISDIR(inf->mode)) { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e02cc8ae5eb3..eeb246845909 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 5f1538c03b1b..a05287a23f62 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags) return -EIO; } -static int bad_file_dir_notify(struct file *file, unsigned long arg) -{ - return -EIO; -} - static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) { return -EIO; @@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops = .sendpage = bad_file_sendpage, .get_unmapped_area = bad_file_get_unmapped_area, .check_flags = bad_file_check_flags, - .dir_notify = bad_file_dir_notify, .flock = bad_file_flock, .splice_write = bad_file_splice_write, .splice_read = bad_file_splice_read, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index b6dfee37c7b7..d06cb023ad02 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_size = 0; inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, - BEFS_SYMLINK_LEN); + BEFS_SYMLINK_LEN - 1); + befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; } else { int num_blks; @@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) kfree(link); befs_error(sb, "Failed to read entire long symlink"); link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; } } else { link = befs_ino->i_data.symlink; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 0ed57b5ee012..cc4062d12ca2 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); + if (!info) + return; + brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); @@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) unsigned i, imap_len; struct bfs_sb_info *info; long ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) @@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; info->si_sbh = bh; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out; + } + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; @@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; + + /* can we read the last block? */ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + iput(inode); + ret = -EIO; + kfree(info->si_imap); + goto out; + } + brelse(bh); + bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; @@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) di = (struct bfs_inode *)bh->b_data + off; + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + s->s_root = NULL; + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; + return -EIO; + } + if (!di->i_ino) { info->si_freei++; continue; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f1f3f4192a60..b639dcf7c778 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u int has_dumped = 0; unsigned long dump_start, dump_size; struct user dump; -#if defined(__alpha__) +#ifdef __alpha__ # define START_DATA(u) (u.start_data) -#elif defined(__arm__) +#else # define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) -#elif defined(__sparc__) -# define START_DATA(u) (u.u_tsize) -#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__) -# define START_DATA(u) (u.u_tsize << PAGE_SHIFT) #endif -#ifdef __sparc__ -# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1)) -#else # define START_STACK(u) (u.start_stack) -#endif fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); -#ifndef __sparc__ dump.u_ar0 = offsetof(struct user, regs); -#endif dump.signal = signr; aout_dump_thread(regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ -#ifdef __sparc__ - if ((dump.u_dsize + dump.u_ssize) > limit) - dump.u_dsize = 0; -#else if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) dump.u_dsize = 0; -#endif /* Make sure we have enough room to write the stack and data areas. */ -#ifdef __sparc__ - if (dump.u_ssize > limit) - dump.u_ssize = 0; -#else if ((dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_ssize = 0; -#endif /* make sure we actually have a data and stack area to dump */ set_fs(USER_DS); -#ifdef __sparc__ - if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize)) - dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize)) - dump.u_ssize = 0; -#else if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) dump.u_dsize = 0; if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) dump.u_ssize = 0; -#endif set_fs(KERNEL_DS); /* struct user */ DUMP_WRITE(&dump,sizeof(dump)); /* Now dump all of the user data. Include malloced stuff as well */ -#ifndef __sparc__ DUMP_SEEK(PAGE_SIZE); -#endif /* now we start writing out the user space info */ set_fs(USER_DS); /* Dump the data area */ if (dump.u_dsize != 0) { dump_start = START_DATA(dump); -#ifdef __sparc__ - dump_size = dump.u_dsize; -#else dump_size = dump.u_dsize << PAGE_SHIFT; -#endif DUMP_WRITE(dump_start,dump_size); } /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); -#ifdef __sparc__ - dump_size = dump.u_ssize; -#else dump_size = dump.u_ssize << PAGE_SHIFT; -#endif DUMP_WRITE(dump_start,dump_size); } /* Finally dump the task struct. Not be used by gdb, but could be useful */ @@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin int envc = bprm->envc; sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); -#ifdef __sparc__ - /* This imposes the proper stack alignment for a new process. */ - sp = (void __user *) (((unsigned long) sp) & ~7); - if ((envc+argc+3)&1) --sp; -#endif #ifdef __alpha__ /* whee.. test-programs are so much fun. */ put_user(0, --sp); put_user(0, --sp); if (bprm->loader) { put_user(0, --sp); - put_user(0x3eb, --sp); + put_user(1003, --sp); put_user(bprm->loader, --sp); - put_user(0x3ea, --sp); + put_user(1002, --sp); } put_user(bprm->exec, --sp); - put_user(0x3e9, --sp); + put_user(1001, --sp); #endif sp -= envc+1; envp = (char __user * __user *) sp; sp -= argc+1; argv = (char __user * __user *) sp; -#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__) +#ifndef __alpha__ put_user((unsigned long) envp,--sp); put_user((unsigned long) argv,--sp); #endif @@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) return retval; /* OK, This is the point of no return */ -#if defined(__alpha__) +#ifdef __alpha__ SET_AOUT_PERSONALITY(bprm, ex); -#elif defined(__sparc__) - set_personality(PER_SUNOS); -#if !defined(__sparc_v9__) - memcpy(¤t->thread.core_exec, &ex, sizeof(struct exec)); -#endif #else set_personality(PER_LINUX); #endif @@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; -#ifdef __sparc__ - if (N_MAGIC(ex) == NMAGIC) { - loff_t pos = fd_offset; - /* Fuck me plenty... */ - /* <AOL></AOL> */ - down_write(¤t->mm->mmap_sem); - error = do_brk(N_TXTADDR(ex), ex.a_text); - up_write(¤t->mm->mmap_sem); - bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex), - ex.a_text, &pos); - down_write(¤t->mm->mmap_sem); - error = do_brk(N_DATADDR(ex), ex.a_data); - up_write(¤t->mm->mmap_sem); - bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex), - ex.a_data, &pos); - goto beyond_if; - } -#endif if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; @@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) text_addr = N_TXTADDR(ex); -#if defined(__alpha__) || defined(__sparc__) +#ifdef __alpha__ pos = fd_offset; map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; #else diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c41fa2af7677..e3ff2b9e602f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *sp; elf_addr_t __user *u_platform; elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; const char *k_platform = ELF_PLATFORM; const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, cred->gid); NEW_AUX_ENT(AT_EGID, cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index aa5b43205e37..f3e72c5c19f5 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct elf_fdpic_params exec_params, interp_params; struct elf_phdr *phdr; unsigned long stack_size, entryaddr; -#ifndef CONFIG_MMU - unsigned long fullsize; -#endif #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; #endif @@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, goto error_kill; } - /* expand the stack mapping to use up the entire allocation granule */ - fullsize = kobjsize((char *) current->mm->start_brk); - if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, - fullsize, 0, 0))) - stack_size = fullsize; up_write(¤t->mm->mmap_sem); current->mm->brk = current->mm->start_brk; @@ -1567,11 +1559,9 @@ end_coredump: static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { - struct vm_list_struct *vml; - - for (vml = current->mm->context.vmlist; vml; vml = vml->next) { - struct vm_area_struct *vma = vml->vma; + struct vm_area_struct *vma; + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { if (!maydump(vma, mm_flags)) continue; @@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, elf_fpxregset_t *xfpu = NULL; #endif int thread_status_size = 0; -#ifndef CONFIG_MMU - struct vm_list_struct *vml; -#endif elf_addr_t *auxv; unsigned long mm_flags; @@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, fill_prstatus(prstatus, current, signr); elf_core_copy_regs(&prstatus->pr_reg, regs); -#ifdef CONFIG_MMU segs = current->mm->map_count; -#else - segs = 0; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - segs++; -#endif #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; #endif @@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, mm_flags = current->mm->flags; /* write program headers for segments dump */ - for ( -#ifdef CONFIG_MMU - vma = current->mm->mmap; vma; vma = vma->vm_next -#else - vml = current->mm->context.vmlist; vml; vml = vml->next -#endif - ) { + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; -#ifndef CONFIG_MMU - vma = vml->vma; -#endif - sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 7bbd5c6b3725..5cebf0b37798 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long len, reallen, memp = 0; - unsigned long extra, rlim; + unsigned long len, memp = 0; + unsigned long memp_size, extra, rlim; unsigned long *reloc = 0, *rp; struct inode *inode; int i, rev, relocs = 0; @@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm, } len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); down_write(¤t->mm->mmap_sem); realdatastart = do_mmap(0, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); - /* Remap to use all availabe slack region space */ - if (realdatastart && (realdatastart < (unsigned long)-4096)) { - reallen = kobjsize((void *)realdatastart); - if (reallen > len) { - realdatastart = do_mremap(realdatastart, len, - reallen, MREMAP_FIXED, realdatastart); - } - } up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { @@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm, reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); memp = realdatastart; - + memp_size = len; } else { len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); down_write(¤t->mm->mmap_sem); textpos = do_mmap(0, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); - /* Remap to use all availabe slack region space */ - if (textpos && (textpos < (unsigned long) -4096)) { - reallen = kobjsize((void *)textpos); - if (reallen > len) { - textpos = do_mremap(textpos, len, reallen, - MREMAP_FIXED, textpos); - } - } up_write(¤t->mm->mmap_sem); if (!textpos || textpos >= (unsigned long) -4096) { @@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm, reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + MAX_SHARED_LIBS * sizeof(unsigned long)); memp = textpos; - + memp_size = len; #ifdef CONFIG_BINFMT_ZFLAT /* * load it all in and treat it like a RAM load from now on @@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm, * set up the brk stuff, uses any slack left in data/bss/stack * allocation. We put the brk after the bss (between the bss * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; - current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; + current->mm->context.end_brk = memp + memp_size - stack_len; } if (flags & FLAT_FLAG_KTRACE) @@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm, /* zero the BSS, BRK and stack areas */ memset((void*)(datapos + data_len), 0, bss_len + - (memp + kobjsize((void *) memp) - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ stack_len); return 0; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f2744ab4e5b3..c4e83537ead7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); } @@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = { static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled" : "disabled"; + char *s = enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 19caf7c962ac..77ebc3c263d6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs) && bip->bip_buf != NULL) kfree(bip->bip_buf); - mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + bvec_free_bs(bs, bip->bip_vec, bip->bip_pool); mempool_free(bip, bs->bio_integrity_pool); bio->bi_integrity = NULL; @@ -31,7 +31,11 @@ DEFINE_TRACE(block_split); -static struct kmem_cache *bio_slab __read_mostly; +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 static mempool_t *bio_split_pool __read_mostly; @@ -40,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly; * break badly! cannot be bigger than what you can fit into an * unsigned short */ - #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { +struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV @@ -53,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + struct bio_slab *bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + bio_slab_max <<= 1; + bio_slabs = krealloc(bio_slabs, + bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!bio_slabs) + goto out_unlock; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + printk("bio: create slab <%s> at %d\n", bslab->name, entry); + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + unsigned int bvec_nr_vecs(unsigned short idx) { return bvec_slabs[idx].nr_vecs; } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +{ + BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); + + if (idx == BIOVEC_MAX_IDX) + mempool_free(bv, bs->bvec_pool); + else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, + struct bio_set *bs) { struct bio_vec *bvl; @@ -67,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct * If not, this is a bio_kmalloc() allocation and just do a * kzalloc() for the exact number of vecs right away. */ - if (bs) { + if (!bs) + bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BIOVEC_MAX_IDX) { +fallback: + bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + /* - * see comment near bvec_array define! + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * idx now points to the pool we want to allocate from + * Try a slab allocation. If this fails and __GFP_WAIT + * is set, retry with the 1-entry mempool */ - bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); - if (bvl) - memset(bvl, 0, - bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); - } else - bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + *idx = BIOVEC_MAX_IDX; + goto fallback; + } + } return bvl; } -void bio_free(struct bio *bio, struct bio_set *bio_set) +void bio_free(struct bio *bio, struct bio_set *bs) { - if (bio->bi_io_vec) { - const int pool_idx = BIO_POOL_IDX(bio); - - BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); + void *p; - mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); - } + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) - bio_integrity_free(bio, bio_set); + bio_integrity_free(bio, bs); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + if (bs->front_pad) + p -= bs->front_pad; - mempool_free(bio, bio_set->bio_pool); + mempool_free(p, bs->bio_pool); } /* @@ -133,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio) static void bio_kmalloc_destructor(struct bio *bio) { - kfree(bio->bi_io_vec); + if (bio_has_allocated_vec(bio)) + kfree(bio->bi_io_vec); kfree(bio); } @@ -157,16 +295,20 @@ void bio_init(struct bio *bio) * for a &struct bio to become free. If a %NULL @bs is passed in, we will * fall back to just using @kmalloc to allocate the required memory. * - * allocate bio and iovecs from the memory pools specified by the - * bio_set structure, or @kmalloc if none given. + * Note that the caller must set ->bi_destructor on succesful return + * of a bio, to do the appropriate freeing of the bio once the reference + * count drops to zero. **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio; + struct bio *bio = NULL; + + if (bs) { + void *p = mempool_alloc(bs->bio_pool, gfp_mask); - if (bs) - bio = mempool_alloc(bs->bio_pool, gfp_mask); - else + if (p) + bio = p + bs->front_pad; + } else bio = kmalloc(sizeof(*bio), gfp_mask); if (likely(bio)) { @@ -176,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (likely(nr_iovecs)) { unsigned long uninitialized_var(idx); - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (nr_iovecs <= BIO_INLINE_VECS) { + idx = 0; + bvl = bio->bi_inline_vecs; + nr_iovecs = BIO_INLINE_VECS; + } else { + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, + bs); + nr_iovecs = bvec_nr_vecs(idx); + } if (unlikely(!bvl)) { if (bs) mempool_free(bio, bs->bio_pool); @@ -186,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) goto out; } bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = bvec_nr_vecs(idx); + bio->bi_max_vecs = nr_iovecs; } bio->bi_io_vec = bvl; } @@ -638,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; for (i = 0; i < iov_count; i++) { unsigned long uaddr; @@ -664,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q, bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; - i = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } while (len) { - unsigned int bytes; + unsigned int bytes = PAGE_SIZE; - if (map_data) - bytes = 1U << (PAGE_SHIFT + map_data->page_order); - else - bytes = PAGE_SIZE; + bytes -= offset; if (bytes > len) bytes = len; if (map_data) { - if (i == map_data->nr_entries) { + if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; break; } - page = map_data->pages[i++]; - } else + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; + if (!page) { + ret = -ENOMEM; + break; + } } - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) break; len -= bytes; + offset = 0; } if (ret) @@ -701,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (!write_to_vm) { + if (!write_to_vm && (!map_data || !map_data->null_mapped)) { ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; @@ -1346,30 +1504,18 @@ EXPORT_SYMBOL(bio_sector_offset); */ static int biovec_create_pools(struct bio_set *bs, int pool_entries) { - int i; + struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - struct biovec_slab *bp = bvec_slabs + i; - mempool_t **bvp = bs->bvec_pools + i; + bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); + if (!bs->bvec_pool) + return -ENOMEM; - *bvp = mempool_create_slab_pool(pool_entries, bp->slab); - if (!*bvp) - return -ENOMEM; - } return 0; } static void biovec_free_pools(struct bio_set *bs) { - int i; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - mempool_t *bvp = bs->bvec_pools[i]; - - if (bvp) - mempool_destroy(bvp); - } - + mempool_destroy(bs->bvec_pool); } void bioset_free(struct bio_set *bs) @@ -1379,25 +1525,49 @@ void bioset_free(struct bio_set *bs) bioset_integrity_free(bs); biovec_free_pools(bs); + bio_put_slab(bs); kfree(bs); } -struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) +/** + * bioset_create - Create a bio_set + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + */ +struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) { - struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + struct bio_set *bs; + bs = kzalloc(sizeof(*bs), GFP_KERNEL); if (!bs) return NULL; - bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); + bs->front_pad = front_pad; + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) { + kfree(bs); + return NULL; + } + + bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); if (!bs->bio_pool) goto bad; - if (bioset_integrity_create(bs, bio_pool_size)) + if (bioset_integrity_create(bs, pool_size)) goto bad; - if (!biovec_create_pools(bs, bvec_pool_size)) + if (!biovec_create_pools(bs, pool_size)) return bs; bad: @@ -1421,12 +1591,16 @@ static void __init biovec_init_slabs(void) static int __init init_bio(void) { - bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); + if (!bio_slabs) + panic("bio: can't allocate bios\n"); bio_integrity_init_slab(); biovec_init_slabs(); - fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); + fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); if (!fs_bio_set) panic("bio: can't allocate bios\n"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 99e0ae1a4c78..b3c1efff5e1d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -285,6 +285,8 @@ static void init_once(void *foo) INIT_LIST_HEAD(&bdev->bd_holder_list); #endif inode_init_once(&ei->vfs_inode); + /* Initialize mutex for freeze. */ + mutex_init(&bdev->bd_fsfreeze_mutex); } static inline void __bd_forget(struct inode *inode) @@ -326,12 +328,13 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct vfsmount *bd_mnt __read_mostly; -struct super_block *blockdev_superblock; +struct super_block *blockdev_superblock __read_mostly; void __init bdev_cache_init(void) { int err; + struct vfsmount *bd_mnt; + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_PANIC), @@ -373,7 +376,7 @@ struct block_device *bdget(dev_t dev) struct block_device *bdev; struct inode *inode; - inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), + inode = iget5_locked(blockdev_superblock, hash(dev), bdev_test, bdev_set, &dev); if (!inode) @@ -463,7 +466,7 @@ void bd_forget(struct inode *inode) spin_lock(&bdev_lock); if (inode->i_bdev) { - if (inode->i_sb != blockdev_superblock) + if (!sb_is_blkdev_sb(inode->i_sb)) bdev = inode->i_bdev; __bd_forget(inode); } @@ -1004,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } lock_kernel(); + restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); @@ -1024,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (disk->fops->open) { ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + module_put(disk->fops->owner); + put_disk(disk); + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + goto restart; + } if (ret) goto out_clear; } @@ -1219,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, @@ -1226,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = generic_writepages, + .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, }; @@ -1261,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @path: special file representing the block device + * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 000000000000..d2cf5a54a4b8 --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,25 @@ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile + +obj-$(CONFIG_BTRFS_FS) := btrfs.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o +else + +# Normal Makefile + +KERNELDIR := /lib/modules/`uname -r`/build +all: + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install +clean: + $(MAKE) -C $(KERNELDIR) M=`pwd` clean + +endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 000000000000..1d53b62dbba5 --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_FS_POSIX_ACL + +static void btrfs_update_cached_acl(struct inode *inode, + struct posix_acl **p_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(*p_acl); + *p_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl = NULL, **p_acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + + spin_lock(&inode->i_lock); + if (*p_acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(*p_acl); + spin_unlock(&inode->i_lock); + + if (acl) + return acl; + + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + btrfs_update_cached_acl(inode, p_acl, acl); + } + kfree(value); + } else if (size == -ENOENT) { + acl = NULL; + btrfs_update_cached_acl(inode, p_acl, acl); + } + + return acl; +} + +static int btrfs_xattr_get_acl(struct inode *inode, int type, + void *value, size_t size) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + struct posix_acl **p_acl; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + ret = 0; + inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + btrfs_update_cached_acl(inode, p_acl, acl); + + return ret; +} + +static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(inode, clone, + ACL_TYPE_ACCESS); + } + } + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 000000000000..8e2fec05dbe0 --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +# include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while (!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + spin_lock_init(&workers->lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + worker->workers = workers; + if (IS_ERR(worker->task)) { + kfree(worker); + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min = workers->num_workers < workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; + /* + * we have failed to find any workers, just + * return the force one + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + check_busy_worker(worker); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 000000000000..31be4ed8b63e --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 000000000000..a8c9693b75ac --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while inesrting or deleting extents from files */ + struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* standard acl pointers */ + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + inode->i_size = size; + BTRFS_I(inode)->disk_i_size = size; +} + + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 000000000000..7c4503ef6efd --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 000000000000..ee848d8585d9 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,709 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d\n", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!btrfs_test_flag(inode, NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 000000000000..421f5b4aa715 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h new file mode 100644 index 000000000000..6e1b3de36700 --- /dev/null +++ b/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 000000000000..9e46c0776816 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,3953 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +inline void btrfs_init_path(struct btrfs_path *p) +{ + memset(p, 0, sizeof(*p)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); + if (path) { + btrfs_init_path(path); + path->reada = 1; + } + return path; +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); + return PTR_ERR(cow); + } + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. + */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + u64 prealloc_dest) +{ + u64 parent_start; + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + int unlock_orig = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + + if (parent) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. + */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *cow_ret = buf; + spin_unlock(&root->fs_info->hash_lock); + WARN_ON(prealloc_dest); + return 0; + } + spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, + prealloc_dest); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + if (k1.objectid > k2->objectid) + return 1; + if (k1.objectid < k2->objectid) + return -1; + if (k1.type > k2->type) + return 1; + if (k1.type < k2->type) + return -1; + if (k1.offset > k2->offset) + return 1; + if (k1.offset < k2->offset) + return -1; + return 0; +} + +/* + * same as comp_keys only with two btrfs_key's + */ +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), 0); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); + BUG_ON(!child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, child->start, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_extent(trans, root, mid->start, mid->len, + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, 0); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + btrfs_tree_lock(right); + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, 0); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 lowest_read; + u64 highest_read; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + highest_read = search; + lowest_read = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search >= lowest_read && search <= highest_read) || + (search < lowest_read && lowest_read - search <= 16384) || + (search > highest_read && search - highest_read <= 16384)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + break; + + if (nread > (256 * 1024) || nscan > 128) + break; + + if (search < lowest_read) + lowest_read = search; + if (search > highest_read) + highest_read = search; + } +} + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + struct extent_buffer *tmp; + int slot; + int ret; + int level; + int should_reada = p->reada; + int lowest_unlock = 1; + int blocksize; + u8 lowest_level = 0; + u64 blocknr; + u64 gen; + struct btrfs_key prealloc_block; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + + prealloc_block.objectid = 0; + +again: + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + int wret; + + /* is a cow on this block not required */ + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + spin_unlock(&root->fs_info->hash_lock); + goto cow_done; + } + spin_unlock(&root->fs_info->hash_lock); + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. + */ + if (level > 1 && !prealloc_block.objectid && + btrfs_path_lock_waiting(p, level)) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; + if (wret) { + free_extent_buffer(b); + ret = wret; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + if (level != 0) { + if (ret && slot > 0) + slot -= 1; + p->slots[level] = slot; + if ((p->search_for_split || ins_len > 0) && + btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret = split_node(trans, root, p, level); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + } else if (ins_len < 0) { + int sret = balance_level(trans, root, p, + level); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + slot = p->slots[level]; + BUG_ON(btrfs_header_nritems(b) == 1); + } + unlock_up(p, level, lowest_unlock); + + /* this is only true while dropping a snapshot */ + if (level == lowest_level) { + ret = 0; + goto done; + } + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + b = tmp; + } else { + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + if (level > 1) { + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + + tmp = read_tree_block(root, blocknr, + blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + goto again; + } else { + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + b = read_node_slot(root, b, slot); + } + } + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + int sret = split_leaf(trans, root, key, + p, ins_len, ret == 0); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } + + return ret; +} + +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + BUG_ON(ret); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owened by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them. + */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + int ret; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, trans->transid, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + memset_extent_buffer(c, 0, 0, root->nodesize); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + spin_lock(&root->node_lock); + old = root->node; + root->node = c; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + if (slot > nritems) + BUG(); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, + root->root_key.objectid, + trans->transid, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + btrfs_set_header_flags(split, btrfs_header_flags(c)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + mid = (c_nritems + 1) / 2; + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + btrfs_node_key(split, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 left_nritems; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + int ret; + + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (empty) + nr = 0; + else + nr = 1; + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 right_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + if (empty) + nr = right_nritems; + else + nr = right_nritems - 1; + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if (push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + int double_split; + int num_doubles = 0; + struct btrfs_disk_key disk_key; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + double_split = 0; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + right = btrfs_alloc_free_block(trans, root, root->leafsize, + path->nodes[1]->start, + root->root_key.objectid, + trans->transid, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } + } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + if (double_split) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + return ret; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + u32 item_size; + struct extent_buffer *leaf; + struct btrfs_key orig_key; + struct btrfs_item *item; + struct btrfs_item *new_item; + int ret = 0; + int slot; + u32 nritems; + u32 orig_offset; + struct btrfs_disk_key disk_key; + char *buf; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); + if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) + goto split; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + btrfs_release_path(root, path); + + path->search_for_split = 1; + path->keep_locks = 1; + + ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 0; + + /* if our item isn't there or got smaller, return now */ + if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], + path->slots[0])) { + path->keep_locks = 0; + return -EAGAIN; + } + + ret = split_leaf(trans, root, &orig_key, path, + sizeof(struct btrfs_item), 1); + path->keep_locks = 0; + BUG_ON(ret); + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + + buf = kmalloc(item_size, GFP_NOFS); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + slot = path->slots[0] + 1; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + kfree(buf); + return ret; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. + */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; + int i; + u32 nritems; + u32 total_size = 0; + u32 total_data = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) +{ + int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + path->nodes[1]->start, + btrfs_header_owner(path->nodes[1]), + root_gen, 0, 1); + return ret; +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + ret = btrfs_del_leaf(trans, root, path, leaf->start); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + wret = push_leaf_left(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, + leaf->start); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans) +{ + int level = lowest_level; + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level = 1; + struct extent_buffer *c; + struct extent_buffer *next = NULL; + struct btrfs_key key; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + btrfs_release_path(root, path); + path->keep_locks = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. + */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + path->slots[0]++; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + if (level == 1 && (path->locks[1] || path->skip_locking) && + path->reada) + reada_for_search(root, path, level, slot, 0); + + next = read_node_slot(root, c, slot); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(c)); + btrfs_tree_lock(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + if (!level) + break; + if (level == 1 && path->locks[1] && path->reada) + reada_for_search(root, path, level, slot, 0); + next = read_node_slot(root, next, 0); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_tree_lock(next); + } + } +done: + unlock_up(path, 0, 1); + return 0; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type == type) + return 0; + if (found_key.objectid < min_objectid) + break; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 000000000000..eee060f88113 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2129 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include <linux/version.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <asm/kmap_types.h> +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_ACL_NOT_CACHED ((void *)-1) + +#ifdef CONFIG_LOCKDEP +# define BTRFS_MAX_LEVEL 7 +#else +# define BTRFS_MAX_LEVEL 8 +#endif + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * the key defines the order in the tree, and so it also defines (optimal) + * block layout. objectid corresonds to the inode number. The flags + * tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with flags of 1 might refer to the inode + * data, flags of 2 may point to file data in the btree and flags == 3 + * may point to extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allowr for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + /* future expansion */ + __le64 reserved[32]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0x0 +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + int search_for_split; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ +struct btrfs_extent_item { + __le32 refs; +} __attribute__ ((__packed__)); + +struct btrfs_extent_ref { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 num_refs; +} __attribute__ ((__packed__)); + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + u64 total_bytes; + u64 bytes_used; + u64 bytes_pinned; + u64 bytes_reserved; + u64 bytes_readonly; + int full; + int force_alloc; + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; + struct rw_semaphore groups_sem; +}; + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + spinlock_t lock; + struct mutex alloc_mutex; + struct mutex cache_mutex; + u64 pinned; + u64 reserved; + u64 flags; + int cached; + int ro; + int dirty; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct rb_root free_space_bytes; + struct rb_root free_space_offset; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; +}; + +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct list_head list; + spinlock_t lock; +}; + +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + u64 generation; + u64 last_trans_committed; + u64 last_trans_new_blockgroup; + u64 open_ioctl_trans; + unsigned long mount_opt; + u64 max_extent; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + + wait_queue_head_t async_submit_wait; + wait_queue_head_t tree_log_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + spinlock_t hash_lock; + struct mutex trans_mutex; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; + struct mutex pinned_mutex; + struct mutex chunk_mutex; + struct mutex drop_mutex; + struct mutex volume_mutex; + struct mutex tree_reloc_mutex; + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t tree_log_writers; + atomic_t tree_log_commit; + unsigned long tree_log_batch; + u64 tree_log_transid; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + struct list_head ordered_extents; + struct list_head delalloc_inodes; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + /* tree relocation relocated fields */ + struct list_head dead_reloc_roots; + struct btrfs_leaf_ref_tree reloc_ref_tree; + struct btrfs_leaf_ref_tree shared_ref_tree; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + atomic_t throttles; + atomic_t throttle_gen; + + u64 total_pinned; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + struct list_head space_info; + spinlock_t delalloc_lock; + spinlock_t new_trans_lock; + u64 delalloc_bytes; + u64 last_alloc; + u64 last_data_alloc; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + void *bdev_holder; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. + */ +struct btrfs_dirty_root; +struct btrfs_root { + struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + + struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_leaf_ref_tree ref_tree_struct; + struct btrfs_dirty_root *dirty_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + struct mutex log_mutex; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + u64 highest_inode; + u64 last_inode_alloc; + int ref_cows; + int track_dirty; + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + int defrag_level; + char *name; + int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + spinlock_t list_lock; + struct list_head dead_list; + struct list_head orphan_list; + + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +/* + + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. There are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 +#define BTRFS_EXTENT_REF_KEY 180 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ + ~BTRFS_INODE_##flag) +#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ + BTRFS_INODE_##flag) +#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ + BTRFS_INODE_##flag) +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +/* struct btrfs_extent_ref */ +BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); +BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); + +BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, + num_refs, 32); + +/* struct btrfs_extent_item */ +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, + refs, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; +} + +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) +{ + return NULL; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] = '\0'; + + return 0; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +/* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_bytes, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data); +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents); +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents); +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start); +int btrfs_add_dead_reloc_root(struct btrfs_root *root); +int btrfs_cleanup_reloc_trees(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +/* ctree.c */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_init_path(struct btrfs_path *p); +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest_root); +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +/* inode-map.c */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_delete_inode(struct inode *inode); +void btrfs_put_inode(struct inode *inode); +void btrfs_read_locked_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait); +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t size); + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +extern struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +u64 btrfs_parse_size(char *str); +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +/* acl.c */ +int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* free-space-cache.c */ +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 000000000000..926a0b287a7d --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,386 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + WARN_ON(ret > 0); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (name_len + data_len + sizeof(struct btrfs_dir_item) > + BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) + return -ENOSPC; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. + */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out; + } + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = index; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret2 = PTR_ERR(dir_item); + goto out; + } + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 000000000000..81a313874ae5 --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,2343 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/radix-tree.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include "compat.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + struct btrfs_work work; +}; + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t page_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + goto out; + } + spin_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return btrfs_crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + *(__le32 *)result = ~cpu_to_le32(crc); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk(KERN_INFO "btrfs: %s checksum verify failed " + "on %llu wanted %X found %X level %d\n", + root->fs_info->sb->s_id, + buf->start, val, found, btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb); +out: + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + found_level = btrfs_header_level(eb); + + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & (1 << BIO_RW)) { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) +{ + return atomic_read(&info->nr_async_bios) > + btrfs_async_submit_limit(info); +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); +#if 0 + int limit = btrfs_async_submit_limit(fs_info); + if (atomic_read(&fs_info->nr_async_submits) > limit) { + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) < limit), + HZ/10); + + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit), + HZ/10); + } +#endif + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return extent_write_full_page(tree, page, btree_get_extent, wbc); +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +#if 0 +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct buffer_head *head; + if (!page_has_buffers(page)) { + create_empty_buffers(page, root->fs_info->sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + head = page_buffers(page); + bh = head; + do { + if (buffer_dirty(bh)) + csum_tree_block(root, bh, 0); + bh = bh->b_this_page; + } while (bh != head); + return block_write_full_page(page, btree_get_block, wbc); +} +#endif + +static struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, + .sync_page = block_sync_page, +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1, WB_SYNC_ALL); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree; + int ret; + + io_tree = &BTRFS_I(btree_inode)->io_tree; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + else + WARN_ON(1); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->ref_tree = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_inode = 0; + root->last_inode_alloc = 0; + root->name = NULL; + root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->dead_list); + spin_lock_init(&root->node_lock); + spin_lock_init(&root->list_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + + btrfs_leaf_ref_tree_init(&root->ref_tree_struct); + root->ref_tree = &root->ref_tree_struct; + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->defrag_level = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + u64 start = 0; + u64 end = 0; + int ret; + + if (!log_root_tree) + return 0; + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log_root_tree->dirty_log_pages, + start, end, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_reserved_extent(fs_info->tree_root, + eb->start, eb->len); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + root->ref_cows = 0; + + root->node = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + btrfs_set_header_bytenr(root->node, root->node->start); + btrfs_set_header_generation(root->node, trans->transid); + btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + fs_info->log_root_tree = root; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto insert; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret != 0) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); +insert: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + ret = btrfs_find_highest_inode(root, &highest_inode); + if (ret == 0) { + root->highest_inode = highest_inode; + root->last_inode_alloc = highest_inode; + } + } + return root; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + if (root) + return root; + + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + set_anon_super(&root->anon_super, NULL); + + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid, root); + BUG_ON(ret); + btrfs_orphan_cleanup(root); + } + return root; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } +#if 0 + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } +#endif + root->in_sysfs = 1; + return root; +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct list_head *cur; + struct btrfs_device *device; + struct backing_dev_info *bdi; +#if 0 + if ((bdi_bits & (1 << BDI_write_congested)) && + btrfs_congested_async(info, 0)) + return 1; +#endif + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct list_head *cur; + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); + } +} + +static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (1 || !page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + + /* + * don't do the expensive searching for a small number of + * devices + */ + if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { + __unplug_io_fn(bdi, page); + return; + } + + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; + } + + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + __unplug_io_fn(bdi, page); + return; + } + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + bdi_init(bdi); + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->state = 0; + bdi->capabilities = default_backing_dev_info.capabilities; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + spin_lock_init(&fs_info->hash_lock); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->throttles, 0); + atomic_set(&fs_info->throttle_gen, 0); + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + setup_bdi(fs_info, &fs_info->bdi); + fs_info->btree_inode = new_inode(sb); + fs_info->btree_inode->i_ino = 1; + fs_info->btree_inode->i_nlink = 1; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + + extent_io_tree_init(&fs_info->pinned_extents, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->do_barriers = 1; + + INIT_LIST_HEAD(&fs_info->dead_reloc_roots); + btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); + btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); + mutex_init(&fs_info->pinned_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->tree_reloc_mutex); + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->tree_log_wait); + atomic_set(&fs_info->tree_log_commit, 0); + atomic_set(&fs_info->tree_log_writers, 0); + fs_info->tree_log_transid = 0; + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) + goto fail_iput; + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_iput; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. + */ + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size)); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; + fs_info->endio_meta_write_workers.idle_thresh = 64; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_write_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_write_workers, + fs_info->thread_pool_size); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sys_array; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_extent_root; + + csum_root->track_dirty = 1; + + btrfs_read_block_groups(extent_root); + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (!fs_info->cleaner_kthread) + goto fail_csum_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (!fs_info->transaction_kthread) + goto fail_cleaner; + + if (btrfs_super_log_root(disk_super) != 0) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + BUG_ON(ret); + } + } + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_reloc_trees(tree_root); + BUG_ON(ret); + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_csum_root: + free_extent_buffer(csum_root->node); +fail_extent_root: + free_extent_buffer(extent_root->node); +fail_tree_root: + free_extent_buffer(tree_root->node); +fail_chunk_root: + free_extent_buffer(chunk_root->node); +fail_sys_array: + free_extent_buffer(dev_root->node); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +fail: + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + kfree(extent_root); + kfree(tree_root); + bdi_destroy(&fs_info->bdi); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + brelse(bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + brelse(bh); + continue; + } + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers && device->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + set_buffer_uptodate(bh); + device->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + + if (!ret && wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + } else if (ret) { + errors++; + } + if (wait) + brelse(bh); + } + return errors < i ? 0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + if (root->node) + free_extent_buffer(root->node); + if (root->commit_root) + free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); + return 0; +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; + ret = btrfs_find_dead_roots(fs_info->tree_root, + root_objectid, gang[i]); + BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + if (fs_info->extent_root->node) + free_extent_buffer(fs_info->extent_root->node); + + if (fs_info->tree_root->node) + free_extent_buffer(fs_info->tree_root->node); + + if (root->fs_info->chunk_root->node) + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node) + free_extent_buffer(root->fs_info->dev_root->node); + + if (root->fs_info->csum_root->node) + free_extent_buffer(root->fs_info->csum_root->node); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + +#if 0 + while (!list_empty(&fs_info->hashers)) { + struct btrfs_hasher *hasher; + hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, + hashers); + list_del(&hasher->hashers); + crypto_free_hash(&fs_info->hash_tfm); + kfree(hasher); + } +#endif + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + + WARN_ON(!btrfs_tree_locked(buf)); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush() || current->flags & PF_MEMALLOC) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + spin_lock(&root->fs_info->hash_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_unlock(&root->fs_info->hash_lock); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 000000000000..c0ff404c31b7 --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +int btrfs_insert_dev_radix(struct btrfs_root *root, + struct block_device *bdev, + u64 device_id, + u64 block_start, + u64 num_blocks); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_open_device(struct btrfs_device *dev); +int btrfs_verify_block_csum(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btree_lock_page_hook(struct page *page); +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 000000000000..85315d2c90de --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,203 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); + if (IS_ERR(root)) + return ERR_CAST(root); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + /* Error */ + btrfs_free_path(path); + return ERR_PTR(ret); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + if (ret) { + /* btrfs_search_slot() returns the slot where we'd want to + insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. + The _real_ backref, telling us what the parent inode + _actually_ is, will be in the slot _before_ the one + that btrfs_search_slot() returns. */ + if (!slot) { + /* Unless there is _no_ key in the tree before... */ + btrfs_free_path(path); + return ERR_PTR(-EIO); + } + slot--; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_free_path(path); + + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + return ERR_PTR(-EINVAL); + + objectid = key.offset; + + /* If we are already at the root of a subvol, return the real root */ + if (objectid == dir->i_ino) + return dget(dir->i_sb->s_root); + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 000000000000..074348a95841 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 000000000000..293da650873f --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,5986 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/version.h> +#include "compat.h" +#include "hash.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "ref-cache.h" +#include "compat.h" + +#define PENDING_EXTENT_INSERT 0 +#define PENDING_EXTENT_DELETE 1 +#define PENDING_BACKREF_UPDATE 2 + +struct pending_extent_op { + int type; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 orig_parent; + u64 generation; + u64 orig_generation; + int level; + struct list_head list; + int del; +}; + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + atomic_inc(&ret->count); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static int add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size; + int ret; + + mutex_lock(&info->pinned_mutex); + while (start < end) { + ret = find_first_extent_bit(&info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY); + if (ret) + break; + + if (extent_start == start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + mutex_unlock(&info->pinned_mutex); + + return 0; +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, 0, + &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + btrfs_remove_free_space(cache, logical[nr], + stripe_len); + } + kfree(logical); + } + return 0; +} + +static int cache_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path; + int ret = 0; + struct btrfs_key key; + struct extent_buffer *leaf; + int slot; + u64 last; + + if (!block_group) + return 0; + + root = root->fs_info->extent_root; + + if (block_group->cached) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + /* + * we get into deadlocks with paths held by callers of this function. + * since the alloc_mutex is protecting things right now, just + * skip the locking here + */ + path->skip_locking = 1; + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + key.objectid = last; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + if (ret == 0) + continue; + else + break; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < block_group->key.objectid) + goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + add_new_free_space(block_group, root->fs_info, last, + key.objectid); + + last = key.objectid + key.offset; + } +next: + path->slots[0]++; + } + + add_new_free_space(block_group, root->fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + remove_sb_from_cache(root, block_group); + block_group->cached = 1; + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains teh given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static inline void put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct list_head *cur; + struct btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } +found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure has fields for: + * + * - Objectid of the subvolume root + * - Generation number of the tree holding the reference + * - objectid of the file holding the reference + * - number of references holding by parent node (alway 1 for tree blocks) + * + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. + * + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) + * + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. + * + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) + * + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: + * + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * - Different generations of the same subvolume + * + * When a tree block is created, back references are inserted: + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. If a extent is tree root, the key offset is set + * to the key objectid. + */ + +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct extent_buffer *leaf; + u64 ref_objectid; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; + } + ret = 0; +out: + return ret; +} + +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when its ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok backref's and items usually go right next to eachother, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. + */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, lets reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we dont overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, 1); + } else if (ret == -EEXIST) { + u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; + } + + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); + } + ret = 0; + } else { + goto out; + } + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + return ret; +} + +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret = 0; + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + num_refs -= 1; + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + btrfs_set_ref_num_refs(leaf, ref, num_refs); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_release_path(root, path); + return ret; +} + +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +} +#endif + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ +#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +#else + return 0; +#endif +} + +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. However if we did find the + * backref next to the extent item them we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backref's are right next to + * eachother and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not. + */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us wether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to eachother, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); + + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) + return ret; + BUG_ON(ret == 0 || path->slots[0] == 0); + + path->slots[0]--; + l = path->nodes[0]; + + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); + } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); + + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + 1); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + + btrfs_free_path(path); + return 0; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} + +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + return 0; +} + +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); + } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref_item; + struct btrfs_key key; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; + + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } + + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; + goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 root_gen; + u32 nritems; + int i; + int level; + int ret = 0; + int shared = 0; + + if (!root->ref_cows) + return 0; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + shared = 0; + root_gen = root->root_key.offset; + } else { + shared = 1; + root_gen = trans->transid - 1; + } + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info *info; + + ref = btrfs_alloc_leaf_ref(root, nr_extents); + if (!ref) { + ret = -ENOMEM; + goto out; + } + + ref->root_gen = root_gen; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_extents; + info = ref->extents; + + for (i = 0; nr_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + ret = btrfs_add_leaf_ref(root, ref, shared); + if (ret == -EEXIST && shared) { + struct btrfs_leaf_ref *old; + old = btrfs_lookup_leaf_ref(root, ref->bytenr); + BUG_ON(!old); + btrfs_remove_leaf_ref(root, old); + btrfs_free_leaf_ref(root, old); + ret = btrfs_add_leaf_ref(root, ref, shared); + } + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } +out: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents) +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + u32 nritems; + u32 nr_file_extents = 0; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int faili = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } else { + bytenr = btrfs_node_blockptr(buf, i); + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } + } +out: + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows) { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } + + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) + goto fail; + } + } + return 0; +fail: + WARN_ON(1); + return -1; +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + int pending_ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); +fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + return 0; + +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *entry; + struct rb_node *n; + int err = 0; + int werr = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + cache = NULL; + spin_lock(&root->fs_info->block_group_cache_lock); + for (n = rb_first(&root->fs_info->block_group_cache_tree); + n; n = rb_next(n)) { + entry = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + if (entry->dirty) { + cache = entry; + break; + } + } + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (!cache) + break; + + cache->dirty = 0; + last += cache->key.offset; + + err = write_one_cache_group(trans, root, + path, cache); + /* + * if we fail to write the cache group, we want + * to keep it marked dirty in hopes that a later + * write will work + */ + if (err) { + werr = err; + continue; + } + } + btrfs_free_path(path); + return werr; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + list_add(&found->list, &info->space_info); + INIT_LIST_HEAD(&found->block_groups); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->full = 0; + found->force_alloc = 0; + *space_info = found; + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (!cache->ro) { + cache->space_info->bytes_readonly += cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->ro = 1; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + u64 thresh; + int ret = 0; + + mutex_lock(&extent_root->fs_info->chunk_mutex); + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + + spin_lock(&space_info->lock); + if (space_info->force_alloc) { + force = 1; + space_info->force_alloc = 0; + } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; + thresh = div_factor(thresh, 6); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret) + space_info->full = 1; +out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->space_info->bytes_used += num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + if (mark_free) { + int ret; + + ret = btrfs_discard_extent(root, bytenr, + num_bytes); + WARN_ON(ret); + + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + WARN_ON(ret); + } + } + put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + put_block_group(cache); + + return bytenr; +} + +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); + if (pin) { + set_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } else { + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + if (pin) { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += len; + cache->space_info->bytes_pinned += len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned += len; + } else { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned -= len; + if (cache->cached) + btrfs_add_free_space(cache, bytenr, len); + } + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += len; + cache->space_info->bytes_reserved += len; + } else { + cache->reserved -= len; + cache->space_info->bytes_reserved -= len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +{ + u64 last = 0; + u64 start; + u64 end; + struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(pinned_extents, last, + &start, &end, EXTENT_DIRTY); + if (ret) + break; + set_extent_dirty(copy, start, end, GFP_NOFS); + last = end + 1; + } + mutex_unlock(&root->fs_info->pinned_mutex); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin) +{ + u64 start; + u64 end; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + ret = btrfs_discard_extent(root, start, end + 1 - start); + + btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } + } + mutex_unlock(&root->fs_info->pinned_mutex); + return ret; +} + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + u64 skipped = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (skipped && all && !num_inserts) { + skipped = 0; + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + skipped = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + mutex_unlock(&info->extent_ins_mutex); + break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead an update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && all && skipped) { + search = 0; + skipped = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it. Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if we broke out of the loop in order to insert stuff because we hit + * the maximum number of inserts at a time we can handle, then loop + * back and pick up where we left off + */ + if (num_inserts == max_inserts) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + num_inserts = 0; + goto again; + } + + /* + * again, if we need to make absolutely sure there are no more pending + * extent operations left and we know that we skipped some, go back to + * the beginning and do it all again + */ + if (all && skipped) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + skipped = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data) +{ + int err = 0; + struct extent_buffer *buf; + + if (is_data) + goto pinit; + + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; + + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. + */ + if (btrfs_buffer_uptodate(buf, 0) && + btrfs_try_tree_lock(buf)) { + u64 header_owner = btrfs_header_owner(buf); + u64 header_transid = btrfs_header_generation(buf); + if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_transid == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + return 1; + } + btrfs_tree_unlock(buf); + } + free_extent_buffer(buf); +pinit: + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + + BUG_ON(err < 0); + return 0; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + int ret; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + struct btrfs_extent_item *ei; + u32 refs; + + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); + if (ret == 0) { + struct btrfs_key found_key; + extent_slot = path->slots[0]; + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid != bytenr) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + } + if (!found_extent) { + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "root %llu gen %llu owner %llu\n", + (unsigned long long)bytenr, + (unsigned long long)root_objectid, + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); + } + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs -= 1; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot + */ + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + BUG_ON(ret); + } + } + + if (refs == 0) { + u64 super_used; + u64 root_used; + + if (pin) { + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + } + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0, + mark_free); + BUG_ON(ret); + } + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + return ret; +} + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; + +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + return err; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; + int ret; + + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } + + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_block_group_cache *cache; + + /* btrfs_free_reserved_extent */ + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + btrfs_add_free_space(cache, bytenr, num_bytes); + put_block_group(cache); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } + + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; + + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0); + + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + int ret; + + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 exclude_start, u64 exclude_nr, + int data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + u64 total_needed = num_bytes; + u64 *last_ptr = NULL; + u64 last_wanted = 0; + struct btrfs_block_group_cache *block_group = NULL; + int chunk_alloc_done = 0; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + struct list_head *head = NULL, *cur = NULL; + int loop = 0; + int extra_loop = 0; + struct btrfs_space_info *space_info; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->last_alloc; + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) + last_ptr = &root->fs_info->last_data_alloc; + + if (last_ptr) { + if (*last_ptr) { + hint_byte = *last_ptr; + last_wanted = *last_ptr; + } else + empty_size += empty_cluster; + } else { + empty_cluster = 0; + } + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (last_wanted && search_start != last_wanted) { + last_wanted = 0; + empty_size += empty_cluster; + } + + total_needed += empty_size; + block_group = btrfs_lookup_block_group(root->fs_info, search_start); + if (!block_group) + block_group = btrfs_lookup_first_block_group(root->fs_info, + search_start); + space_info = __find_space_info(root->fs_info, data); + + down_read(&space_info->groups_sem); + while (1) { + struct btrfs_free_space *free_space; + /* + * the only way this happens if our hint points to a block + * group thats not of the proper type, while looping this + * should never happen + */ + if (empty_size) + extra_loop = 1; + + if (!block_group) + goto new_group_no_lock; + + if (unlikely(!block_group->cached)) { + mutex_lock(&block_group->cache_mutex); + ret = cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + if (ret) + break; + } + + mutex_lock(&block_group->alloc_mutex); + if (unlikely(!block_group_bits(block_group, data))) + goto new_group; + + if (unlikely(block_group->ro)) + goto new_group; + + free_space = btrfs_find_free_space(block_group, search_start, + total_needed); + if (free_space) { + u64 start = block_group->key.objectid; + u64 end = block_group->key.objectid + + block_group->key.offset; + + search_start = stripe_align(root, free_space->offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) + goto new_group; + + /* move on to the next group */ + if (search_start + num_bytes > end) + goto new_group; + + if (last_wanted && search_start != last_wanted) { + total_needed += empty_cluster; + empty_size += empty_cluster; + last_wanted = 0; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + last_wanted = 0; + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + btrfs_remove_free_space_lock(block_group, search_start, + num_bytes); + /* we are all good, lets return */ + mutex_unlock(&block_group->alloc_mutex); + break; + } +new_group: + mutex_unlock(&block_group->alloc_mutex); + put_block_group(block_group); + block_group = NULL; +new_group_no_lock: + /* don't try to compare new allocations against the + * last allocation any more + */ + last_wanted = 0; + + /* + * Here's how this works. + * loop == 0: we were searching a block group via a hint + * and didn't find anything, so we start at + * the head of the block groups and keep searching + * loop == 1: we're searching through all of the block groups + * if we hit the head again we have searched + * all of the block groups for this space and we + * need to try and allocate, if we cant error out. + * loop == 2: we allocated more space and are looping through + * all of the block groups again. + */ + if (loop == 0) { + head = &space_info->block_groups; + cur = head->next; + loop++; + } else if (loop == 1 && cur == head) { + int keep_going; + + /* at this point we give up on the empty_size + * allocations and just try to allocate the min + * space. + * + * The extra_loop field was set if an empty_size + * allocation was attempted above, and if this + * is try we need to try the loop again without + * the additional empty_size. + */ + total_needed -= empty_size; + empty_size = 0; + keep_going = extra_loop; + loop++; + + if (allowed_chunk_alloc && !chunk_alloc_done) { + up_read(&space_info->groups_sem); + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + down_read(&space_info->groups_sem); + if (ret < 0) + goto loop_check; + head = &space_info->block_groups; + /* + * we've allocated a new chunk, keep + * trying + */ + keep_going = 1; + chunk_alloc_done = 1; + } else if (!allowed_chunk_alloc) { + space_info->force_alloc = 1; + } +loop_check: + if (keep_going) { + cur = head->next; + extra_loop = 0; + } else { + break; + } + } else if (cur == head) { + break; + } + + block_group = list_entry(cur, struct btrfs_block_group_cache, + list); + atomic_inc(&block_group->count); + + search_start = block_group->key.objectid; + cur = cur->next; + } + + /* we found what we needed */ + if (ins->objectid) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group->key.objectid; + + if (last_ptr) + *last_ptr = ins->objectid + ins->offset; + ret = 0; + } else if (!ret) { + printk(KERN_ERR "btrfs searching for %llu bytes, " + "num_bytes %llu, loop %d, allowed_alloc %d\n", + (unsigned long long)total_needed, + (unsigned long long)num_bytes, + loop, allowed_chunk_alloc); + ret = -ENOSPC; + } + if (block_group) + put_block_group(block_group); + + up_read(&space_info->groups_sem); + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +{ + struct btrfs_block_group_cache *cache; + struct list_head *l; + + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved), + (info->full) ? "" : "not "); + + down_read(&info->groups_sem); + list_for_each(l, &info->block_groups) { + cache = list_entry(l, struct btrfs_block_group_cache, list); + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + up_read(&info->groups_sem); +} + +static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + u64 alloc_profile; + struct btrfs_fs_info *info = root->fs_info; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } +again: + data = btrfs_reduce_alloc_profile(root, data); + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) { + if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + BTRFS_BLOCK_GROUP_METADATA | + (info->metadata_alloc_profile & + info->avail_metadata_alloc_bits), 0); + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); + } + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, ins, + trans->alloc_exclude_start, + trans->alloc_exclude_nr, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, 1); + goto again; + } + if (ret) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes); + BUG(); + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); + put_block_group(cache); + update_reserved_extents(root, start, len, 0); + + return ret; +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, + empty_size, hint_byte, search_end, ins, + data); + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; +} + +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + int pending_ret; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_ref *ref; + struct btrfs_path *path; + struct btrfs_key keys[2]; + + if (parent == 0) + parent = ins->objectid; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); + + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); + BUG_ON(ret); + + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); + + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + + if (ret) + goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } + +update_block: + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } +out: + return ret; +} + +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + mutex_lock(&block_group->cache_mutex); + cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + + ret = btrfs_remove_free_space(block_group, ins->objectid, + ins->offset); + BUG_ON(ret); + put_block_group(block_group); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + return ret; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. + */ +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) +{ + int ret; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); + BUG_ON(ret); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); + BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); + } + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + return buf; +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. + */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_key ins; + int ret; + struct extent_buffer *buf; + + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + return buf; +} + +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) +{ + u64 leaf_owner; + u64 leaf_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int nritems; + int ret; + + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + + for (i = 0; i < nritems; i++) { + u64 disk_bytenr; + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + /* + * FIXME make sure to insert a trans record that + * repeats the snapshot del on crash + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + continue; + + ret = __btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + return 0; +} + +static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + struct btrfs_extent_info *info = ref->extents; + + for (i = 0; i < ref->nritems; i++) { + ret = __btrfs_free_extent(trans, root, info->bytenr, + info->num_bytes, ref->bytenr, + ref->owner, ref->generation, + info->objectid, 0); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + BUG_ON(ret); + info++; + } + + return 0; +} + +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + u64 len, u32 *refs) +{ + int ret; + + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + +#if 0 /* some debugging code in case we see problems here */ + /* if the refs count is one, it won't get increased again. But + * if the ref count is > 1, someone may be decreasing it at + * the same time we are. + */ + if (*refs != 1) { + struct extent_buffer *eb = NULL; + eb = btrfs_find_create_tree_block(root, start, len); + if (eb) + btrfs_tree_lock(eb); + + mutex_lock(&root->fs_info->alloc_mutex); + ret = lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); + + if (eb) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (*refs == 1) { + printk(KERN_ERR "btrfs block %llu went down to one " + "during drop_snap\n", (unsigned long long)start); + } + + } +#endif + + cond_resched(); + return ret; +} + +/* + * helper function for drop_snapshot, this walks down the tree dropping ref + * counts as it goes. + */ +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; + u32 blocksize; + int ret; + u32 refs; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + path->nodes[*level]->len, &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + /* + * walk down to the last node level and free all the leaves + */ + while (*level >= 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) { + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + path->slots[*level]++; + + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + root_owner, root_gen, + *level - 1, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + continue; + } + /* + * at this point, we have a single ref, and since the + * only place referencing this extent is a dead root + * the reference count should never go higher. + * So, we don't need to check it again + */ + if (*level == 1) { + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { + free_extent_buffer(next); + + next = read_tree_block(root, bytenr, blocksize, + ptr_gen); + cond_resched(); +#if 0 + /* + * this is a debugging check and can go away + * the ref should never go all the way down to 1 + * at this point + */ + ret = lookup_extent_ref(NULL, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + WARN_ON(refs != 1); +#endif + } + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } +out: + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) { + parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; + } else { + parent = path->nodes[*level + 1]; + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + } + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, root_owner, root_gen, + *level, 1); + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + BUG_ON(ret); + + cond_resched(); + return 0; +} + +/* + * helper function for drop_subtree, this function is similar to + * walk_down_tree. The main difference is that it checks reference + * counts while tree blocks are locked. + */ +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 refs; + int ret; + + cur = path->nodes[*level]; + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + while (*level >= 0) { + cur = path->nodes[*level]; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + clean_tree_block(trans, root, cur); + break; + } + if (path->slots[*level] >= btrfs_header_nritems(cur)) { + clean_tree_block(trans, root, cur); + break; + } + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + if (refs > 1) { + parent = path->nodes[*level]; + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); + BUG_ON(ret); + path->slots[*level]++; + btrfs_tree_unlock(next); + free_extent_buffer(next); + continue; + } + + *level = btrfs_header_level(next); + path->nodes[*level] = next; + path->slots[*level] = 0; + path->locks[*level] = 1; + cond_resched(); + } +out: + parent = path->nodes[*level + 1]; + bytenr = path->nodes[*level]->start; + blocksize = path->nodes[*level]->len; + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); + BUG_ON(ret); + + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + cond_resched(); + return 0; +} + +/* + * helper for dropping snapshots. This walks back up the tree in the path + * to find the first node higher up where we haven't yet gone through + * all the slots + */ +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int *level, int max_level) +{ + u64 root_owner; + u64 root_gen; + struct btrfs_root_item *root_item = &root->root_item; + int i; + int slot; + int ret; + + for (i = *level; i < max_level && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); + BUG_ON(ret); + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + struct btrfs_root_item *root_item = &root->root_item; + + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(root->node); + orig_level = level; + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + path->nodes[level] = root->node; + extent_buffer_get(root->node); + path->slots[level] = 0; + } else { + struct btrfs_key key; + struct btrfs_disk_key found_key; + struct extent_buffer *node; + + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; + wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (wret < 0) { + ret = wret; + goto out; + } + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } + } + while (1) { + wret = walk_down_tree(trans, root, path, &level); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_tree(trans, root, path, &level, + BTRFS_MAX_LEVEL); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (trans->transaction->in_commit) { + ret = -EAGAIN; + break; + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + int level; + int parent_level; + int ret = 0; + int wret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + BUG_ON(!btrfs_tree_locked(parent)); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + BUG_ON(!btrfs_tree_locked(node)); + level = btrfs_header_level(node); + extent_buffer_get(node); + path->nodes[level] = node; + path->slots[level] = 0; + + while (1) { + wret = walk_down_subtree(trans, root, path, &level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + + wret = walk_up_tree(trans, root, path, &level, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + btrfs_free_path(path); + return ret; +} + +static unsigned long calc_ra(unsigned long start, unsigned long last, + unsigned long nr) +{ + return min(last, start + nr - 1); +} + +static noinline int relocate_inode_pages(struct inode *inode, u64 start, + u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long first_index; + unsigned long last_index; + unsigned long i; + struct page *page; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + calc_ra(i, last_index, ra->ra_pages)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } + +out_unlock: + kfree(ra); + mutex_unlock(&inode->i_mutex); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline int relocate_data_extent(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - offset; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em || IS_ERR(em)); + + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(reloc_inode, start, end, 0); + } + unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(reloc_inode, start, extent_key->offset); +} + +struct btrfs_ref_path { + u64 extent_start; + u64 nodes[BTRFS_MAX_LEVEL]; + u64 root_objectid; + u64 root_generation; + u64 owner_objectid; + u32 num_refs; + int lowest_level; + int current_level; + int shared_level; + + struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; + u64 new_nodes[BTRFS_MAX_LEVEL]; +}; + +struct disk_extent { + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; +}; + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static noinline int __next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + int first_time) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_extent_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + u64 bytenr; + u32 nritems; + int level; + int ret = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (first_time) { + ref_path->lowest_level = -1; + ref_path->current_level = -1; + ref_path->shared_level = -1; + goto walk_up; + } +walk_down: + level = ref_path->current_level - 1; + while (level >= -1) { + u64 parent; + if (level < ref_path->lowest_level) + break; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + BUG_ON(bytenr == 0); + + parent = ref_path->nodes[level + 1]; + ref_path->nodes[level + 1] = 0; + ref_path->current_level = level; + BUG_ON(parent == 0); + + key.objectid = bytenr; + key.offset = parent + 1; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) + goto next; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid == bytenr && + found_key.type == BTRFS_EXTENT_REF_KEY) { + if (level < ref_path->shared_level) + ref_path->shared_level = level; + goto found; + } +next: + level--; + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached lowest level */ + ret = 1; + goto out; +walk_up: + level = ref_path->current_level; + while (level < BTRFS_MAX_LEVEL - 1) { + u64 ref_objectid; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + + BUG_ON(bytenr == 0); + + key.objectid = bytenr; + key.offset = 0; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) + goto out; + btrfs_release_path(extent_root, path); + goto walk_down; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) { + ret = 1; + goto out; + } + btrfs_release_path(extent_root, path); + goto walk_down; + } +found: + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (first_time) { + level = (int)ref_objectid; + BUG_ON(level >= BTRFS_MAX_LEVEL); + ref_path->lowest_level = level; + ref_path->current_level = level; + ref_path->nodes[level] = bytenr; + } else { + WARN_ON(ref_objectid != level); + } + } else { + WARN_ON(level != -1); + } + first_time = 0; + + if (ref_path->lowest_level == level) { + ref_path->owner_objectid = ref_objectid; + ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); + } + + /* + * the block is tree root or the block isn't in reference + * counted tree. + */ + if (found_key.objectid == found_key.offset || + is_cowonly_root(btrfs_ref_root(leaf, ref))) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + if (level < 0) { + /* special reference from the tree log */ + ref_path->nodes[0] = found_key.offset; + ref_path->current_level = 0; + } + ret = 0; + goto out; + } + + level++; + BUG_ON(ref_path->nodes[level] != 0); + ref_path->nodes[level] = found_key.offset; + ref_path->current_level = level; + + /* + * the reference was created in the running transaction, + * no need to continue walking up. + */ + if (btrfs_ref_generation(leaf, ref) == trans->transid) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + ret = 0; + goto out; + } + + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached max tree level, but no tree root found. */ + BUG(); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + u64 extent_start) +{ + memset(ref_path, 0, sizeof(*ref_path)); + ref_path->extent_start = extent_start; + + return __next_ref_path(trans, extent_root, ref_path, 1); +} + +static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path) +{ + return __next_ref_path(trans, extent_root, ref_path, 0); +} + +static noinline int get_new_locations(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset, int no_fragment, + struct disk_extent **extents, + int *nr_extents) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct disk_extent *exts = *extents; + struct btrfs_key found_key; + u64 cur_pos; + u64 last_byte; + u32 nritems; + int nr = 0; + int max = *nr_extents; + int ret; + + WARN_ON(!no_fragment && *extents); + if (!exts) { + max = 1; + exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) + return -ENOMEM; + } + + path = btrfs_alloc_path(); + BUG_ON(!path); + + cur_pos = extent_key->objectid - offset; + last_byte = extent_key->objectid + extent_key->offset; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + cur_pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.offset != cur_pos || + found_key.type != BTRFS_EXTENT_DATA_KEY || + found_key.objectid != reloc_inode->i_ino) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + break; + + if (nr == max) { + struct disk_extent *old = exts; + max *= 2; + exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + memcpy(exts, old, sizeof(*exts) * nr); + if (old != *extents) + kfree(old); + } + + exts[nr].disk_bytenr = + btrfs_file_extent_disk_bytenr(leaf, fi); + exts[nr].disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + exts[nr].offset = btrfs_file_extent_offset(leaf, fi); + exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); + BUG_ON(exts[nr].offset > 0); + BUG_ON(exts[nr].compression || exts[nr].encryption); + BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); + + cur_pos += exts[nr].num_bytes; + nr++; + + if (cur_pos + offset >= last_byte) + break; + + if (no_fragment) { + ret = 1; + goto out; + } + path->slots[0]++; + } + + BUG_ON(cur_pos + offset > last_byte); + if (cur_pos + offset < last_byte) { + ret = -ENOENT; + goto out; + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) { + if (exts != *extents) + kfree(exts); + } else { + *extents = exts; + *nr_extents = nr; + } + return ret; +} + +static noinline int replace_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_key *leaf_key, + struct btrfs_ref_path *ref_path, + struct disk_extent *new_extents, + int nr_extents) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct btrfs_key key; + u64 lock_start = 0; + u64 lock_end = 0; + u64 num_bytes; + u64 ext_offset; + u64 first_pos; + u32 nritems; + int nr_scaned = 0; + int extent_locked = 0; + int extent_type; + int ret; + + memcpy(&key, leaf_key, sizeof(key)); + first_pos = INT_LIMIT(loff_t) - extent_key->offset; + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if (key.objectid < ref_path->owner_objectid || + (key.objectid == ref_path->owner_objectid && + key.type < BTRFS_EXTENT_DATA_KEY)) { + key.objectid = ref_path->owner_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + } + } + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); +next: + if (extent_locked && ret > 0) { + /* + * the file extent item was modified by someone + * before the extent got locked. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } + + if (path->slots[0] >= nritems) { + if (++nr_scaned > 2) + break; + + BUG_ON(extent_locked); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if ((key.objectid > ref_path->owner_objectid) || + (key.objectid == ref_path->owner_objectid && + key.type > BTRFS_EXTENT_DATA_KEY) || + (key.offset >= first_pos + extent_key->offset)) + break; + } + + if (inode && key.objectid != inode->i_ino) { + BUG_ON(extent_locked); + btrfs_release_path(root, path); + mutex_unlock(&inode->i_mutex); + iput(inode); + inode = NULL; + continue; + } + + if (key.type != BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + ret = 1; + goto next; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if ((extent_type != BTRFS_FILE_EXTENT_REG && + extent_type != BTRFS_FILE_EXTENT_PREALLOC) || + (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid)) { + path->slots[0]++; + ret = 1; + goto next; + } + + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + ext_offset = btrfs_file_extent_offset(leaf, fi); + + if (first_pos > key.offset - ext_offset) + first_pos = key.offset - ext_offset; + + if (!extent_locked) { + lock_start = key.offset; + lock_end = lock_start + num_bytes - 1; + } else { + if (lock_start > key.offset || + lock_end + 1 < key.offset + num_bytes) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + extent_locked = 0; + } + } + + if (!inode) { + btrfs_release_path(root, path); + + inode = btrfs_iget_locked(root->fs_info->sb, + key.objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = + key.objectid; + BTRFS_I(inode)->location.type = + BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* + * some code call btrfs_commit_transaction while + * holding the i_mutex, so we can't use mutex_lock + * here. + */ + if (is_bad_inode(inode) || + !mutex_trylock(&inode->i_mutex)) { + iput(inode); + inode = NULL; + key.offset = (u64)-1; + goto skip; + } + } + + if (!extent_locked) { + struct btrfs_ordered_extent *ordered; + + btrfs_release_path(root, path); + + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + lock_end); + if (ordered && + ordered->file_offset <= lock_end && + ordered->file_offset + ordered->len > lock_start) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + key.offset += num_bytes; + goto skip; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + extent_locked = 1; + continue; + } + + if (nr_extents == 1) { + /* update extent pointer in place */ + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[0].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[0].disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[0].disk_bytenr, + new_extents[0].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, + key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + extent_key->objectid, + extent_key->offset, + leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + + btrfs_release_path(root, path); + key.offset += num_bytes; + } else { + BUG_ON(1); +#if 0 + u64 alloc_hint; + u64 extent_len; + int i; + /* + * drop old extent pointer at first, then insert the + * new pointers one bye one + */ + btrfs_release_path(root, path); + ret = btrfs_drop_extents(trans, root, inode, key.offset, + key.offset + num_bytes, + key.offset, &alloc_hint); + BUG_ON(ret); + + for (i = 0; i < nr_extents; i++) { + if (ext_offset >= new_extents[i].num_bytes) { + ext_offset -= new_extents[i].num_bytes; + continue; + } + extent_len = min(new_extents[i].num_bytes - + ext_offset, num_bytes); + + ret = btrfs_insert_empty_item(trans, root, + path, &key, + sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[i].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len); + ext_offset += new_extents[i].offset; + btrfs_set_file_extent_offset(leaf, fi, + ext_offset); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + extent_len - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[i].disk_bytenr, + new_extents[i].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + btrfs_release_path(root, path); + + inode_add_bytes(inode, extent_len); + + ext_offset = 0; + num_bytes -= extent_len; + key.offset += extent_len; + + if (num_bytes == 0) + break; + } + BUG_ON(i >= nr_extents); +#endif + } + + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } +skip: + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && + key.offset >= first_pos + extent_key->offset) + break; + + cond_resched(); + } + ret = 0; +out: + btrfs_release_path(root, path); + if (inode) { + mutex_unlock(&inode->i_mutex); + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + } + iput(inode); + } + return ret; +} + +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start) +{ + int level; + int ret; + + BUG_ON(btrfs_header_generation(buf) != trans->transid); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + level = btrfs_header_level(buf); + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_leaf_ref *orig_ref; + + orig_ref = btrfs_lookup_leaf_ref(root, orig_start); + if (!orig_ref) + return -ENOENT; + + ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); + if (!ref) { + btrfs_free_leaf_ref(root, orig_ref); + return -ENOMEM; + } + + ref->nritems = orig_ref->nritems; + memcpy(ref->extents, orig_ref->extents, + sizeof(ref->extents[0]) * ref->nritems); + + btrfs_free_leaf_ref(root, orig_ref); + + ref->root_gen = trans->transid; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } + return 0; +} + +static noinline int invalidate_extent_cache(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct btrfs_root *target_root) +{ + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_file_extent_item *fi; + u64 num_bytes; + u64 skip_objectid = 0; + u32 nritems; + u32 i; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.objectid == skip_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + continue; + if (!inode || inode->i_ino != key.objectid) { + iput(inode); + inode = btrfs_ilookup(target_root->fs_info->sb, + key.objectid, target_root, 1); + } + if (!inode) { + skip_objectid = key.objectid; + continue; + } + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + + lock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + cond_resched(); + } + iput(inode); + return 0; +} + +static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_key key; + struct btrfs_key extent_key; + struct btrfs_file_extent_item *fi; + struct btrfs_leaf_ref *ref; + struct disk_extent *new_extent; + u64 bytenr; + u64 num_bytes; + u32 nritems; + u32 i; + int ext_index; + int nr_extent; + int ret; + + new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); + BUG_ON(!new_extent); + + ref = btrfs_lookup_leaf_ref(root, leaf->start); + BUG_ON(!ref); + + ext_index = -1; + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + + ext_index++; + if (bytenr >= group->key.objectid + group->key.offset || + bytenr + num_bytes <= group->key.objectid) + continue; + + extent_key.objectid = bytenr; + extent_key.offset = num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + nr_extent = 1; + ret = get_new_locations(reloc_inode, &extent_key, + group->key.objectid, 1, + &new_extent, &nr_extent); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + BUG_ON(ref->extents[ext_index].bytenr != bytenr); + BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); + ref->extents[ext_index].bytenr = new_extent->disk_bytenr; + ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; + + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extent->disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extent->disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, + new_extent->disk_bytenr, + new_extent->disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + ret = btrfs_free_extent(trans, root, + bytenr, num_bytes, leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + cond_resched(); + } + kfree(new_extent); + BUG_ON(ext_index + 1 != ref->nritems); + btrfs_free_leaf_ref(root, ref); + return 0; +} + +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + root->reloc_root = NULL; + list_add(&reloc_root->dead_list, + &root->fs_info->dead_reloc_roots); + + btrfs_set_root_bytenr(&reloc_root->root_item, + reloc_root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(reloc_root->node)); + memset(&reloc_root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + reloc_root->root_item.drop_level = 0; + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, + &reloc_root->root_item); + BUG_ON(ret); + } + return 0; +} + +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root *prev_root = NULL; + struct list_head dead_roots; + int ret; + unsigned long nr; + + INIT_LIST_HEAD(&dead_roots); + list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); + + while (!list_empty(&dead_roots)) { + reloc_root = list_entry(dead_roots.prev, + struct btrfs_root, dead_list); + list_del_init(&reloc_root->dead_list); + + BUG_ON(reloc_root->commit_root != NULL); + while (1) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, reloc_root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + } + + free_extent_buffer(reloc_root->node); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &reloc_root->root_key); + BUG_ON(ret); + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + + kfree(prev_root); + prev_root = reloc_root; + } + if (prev_root) { + btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); + kfree(prev_root); + } + return 0; +} + +int btrfs_add_dead_reloc_root(struct btrfs_root *root) +{ + list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); + return 0; +} + +int btrfs_cleanup_reloc_trees(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + struct btrfs_key location; + int found; + int ret; + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); + BUG_ON(ret); + found = !list_empty(&root->fs_info->dead_reloc_roots); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + if (found) { + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } + + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + location.offset = (u64)-1; + location.type = BTRFS_ROOT_ITEM_KEY; + + reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + BUG_ON(!reloc_root); + btrfs_orphan_cleanup(reloc_root); + return 0; +} + +static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + BUG_ON(!root->ref_cows); + if (root->reloc_root) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + ret = btrfs_copy_root(trans, root, root->commit_root, + &eb, BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.offset = root->root_key.objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + + memcpy(root_item, &root->root_item, sizeof(root_item)); + btrfs_set_root_refs(root_item, 0); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(!reloc_root); + reloc_root->last_trans = trans->transid; + reloc_root->commit_root = NULL; + reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; + + root->reloc_root = reloc_root; + return 0; +} + +/* + * Core function of space balance. + * + * The idea is using reloc trees to relocate tree blocks in reference + * counted roots. There is one reloc tree for each subvol, and all + * reloc trees share same root key objectid. Reloc trees are snapshots + * of the latest committed roots of subvols (root->commit_root). + * + * To relocate a tree block referenced by a subvol, there are two steps. + * COW the block through subvol's reloc tree, then update block pointer + * in the subvol to point to the new block. Since all reloc trees share + * same root key objectid, doing special handing for tree blocks owned + * by them is easy. Once a tree block has been COWed in one reloc tree, + * we can use the resulting new block directly when the same block is + * required to COW again through other reloc trees. By this way, relocated + * tree blocks are shared between reloc trees, so they are also shared + * between subvols. + */ +static noinline int relocate_one_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb = NULL; + struct btrfs_key *keys; + u64 *nodes; + int level; + int shared_level; + int lowest_level = 0; + int ret; + + if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + lowest_level = ref_path->owner_objectid; + + if (!root->ref_cows) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); + BUG_ON(ret < 0); + path->lowest_level = 0; + btrfs_release_path(root, path); + return 0; + } + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = init_reloc_tree(trans, root); + BUG_ON(ret); + reloc_root = root->reloc_root; + + shared_level = ref_path->shared_level; + ref_path->shared_level = BTRFS_MAX_LEVEL - 1; + + keys = ref_path->node_keys; + nodes = ref_path->new_nodes; + memset(&keys[shared_level + 1], 0, + sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); + memset(&nodes[shared_level + 1], 0, + sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); + + if (nodes[lowest_level] == 0) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 1); + BUG_ON(ret); + for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { + eb = path->nodes[level]; + if (!eb || eb == reloc_root->node) + break; + nodes[level] = eb->start; + if (level == 0) + btrfs_item_key_to_cpu(eb, &keys[level], 0); + else + btrfs_node_key_to_cpu(eb, &keys[level], 0); + } + if (nodes[0] && + ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + eb = path->nodes[0]; + ret = replace_extents_in_leaf(trans, reloc_root, eb, + group, reloc_inode); + BUG_ON(ret); + } + btrfs_release_path(reloc_root, path); + } else { + ret = btrfs_merge_path(trans, reloc_root, keys, nodes, + lowest_level); + BUG_ON(ret); + } + + /* + * replace tree blocks in the fs tree with tree blocks in + * the reloc tree. + */ + ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); + BUG_ON(ret < 0); + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 0); + BUG_ON(ret); + extent_buffer_get(path->nodes[0]); + eb = path->nodes[0]; + btrfs_release_path(reloc_root, path); + ret = invalidate_extent_cache(reloc_root, eb, group, root); + BUG_ON(ret); + free_extent_buffer(eb); + } + + mutex_unlock(&root->fs_info->tree_reloc_mutex); + path->lowest_level = 0; + return 0; +} + +static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path) +{ + int ret; + + ret = relocate_one_path(trans, root, path, first_key, + ref_path, NULL, NULL); + BUG_ON(ret); + + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + + return 0; +} + +static noinline int del_extent_zero(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_release_path(extent_root, path); + return ret; +} + +static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, + struct btrfs_ref_path *ref_path) +{ + struct btrfs_key root_key; + + root_key.objectid = ref_path->root_objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(ref_path->root_objectid)) + root_key.offset = 0; + else + root_key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &root_key); +} + +static noinline int relocate_one_extent(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode, int pass) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *found_root; + struct btrfs_ref_path *ref_path = NULL; + struct disk_extent *new_extents = NULL; + int nr_extents = 0; + int loops; + int ret; + int level; + struct btrfs_key first_key; + u64 prev_block = 0; + + + trans = btrfs_start_transaction(extent_root, 1); + BUG_ON(!trans); + + if (extent_key->objectid == 0) { + ret = del_extent_zero(trans, extent_root, path, extent_key); + goto out; + } + + ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); + if (!ref_path) { + ret = -ENOMEM; + goto out; + } + + for (loops = 0; ; loops++) { + if (loops == 0) { + ret = btrfs_first_ref_path(trans, extent_root, ref_path, + extent_key->objectid); + } else { + ret = btrfs_next_ref_path(trans, extent_root, ref_path); + } + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || + ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) + continue; + + found_root = read_ref_root(extent_root->fs_info, ref_path); + BUG_ON(!found_root); + /* + * for reference counted tree, only process reference paths + * rooted at the latest committed root. + */ + if (found_root->ref_cows && + ref_path->root_generation != found_root->root_key.offset) + continue; + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (pass == 0) { + /* + * copy data extents to new locations + */ + u64 group_start = group->key.objectid; + ret = relocate_data_extent(reloc_inode, + extent_key, + group_start); + if (ret < 0) + goto out; + break; + } + level = 0; + } else { + level = ref_path->owner_objectid; + } + + if (prev_block != ref_path->nodes[level]) { + struct extent_buffer *eb; + u64 block_start = ref_path->nodes[level]; + u64 block_size = btrfs_level_size(found_root, level); + + eb = read_tree_block(found_root, block_start, + block_size, 0); + btrfs_tree_lock(eb); + BUG_ON(level != btrfs_header_level(eb)); + + if (level == 0) + btrfs_item_key_to_cpu(eb, &first_key, 0); + else + btrfs_node_key_to_cpu(eb, &first_key, 0); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + prev_block = block_start; + } + + btrfs_record_root_in_trans(found_root); + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + /* + * try to update data extent references while + * keeping metadata shared between snapshots. + */ + if (pass == 1) { + ret = relocate_one_path(trans, found_root, + path, &first_key, ref_path, + group, reloc_inode); + if (ret < 0) + goto out; + continue; + } + /* + * use fallback method to process the remaining + * references. + */ + if (!new_extents) { + u64 group_start = group->key.objectid; + new_extents = kmalloc(sizeof(*new_extents), + GFP_NOFS); + nr_extents = 1; + ret = get_new_locations(reloc_inode, + extent_key, + group_start, 1, + &new_extents, + &nr_extents); + if (ret) + goto out; + } + ret = replace_one_extent(trans, found_root, + path, extent_key, + &first_key, ref_path, + new_extents, nr_extents); + } else { + ret = relocate_tree_block(trans, found_root, path, + &first_key, ref_path); + } + if (ret < 0) + goto out; + } + ret = 0; +out: + btrfs_end_transaction(trans, extent_root); + kfree(new_extents); + kfree(ref_path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = root->fs_info->fs_devices->rw_devices; + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int __alloc_chunk_for_shrink(struct btrfs_root *root, + struct btrfs_block_group_cache *shrink_block_group, + int force) +{ + struct btrfs_trans_handle *trans; + u64 new_alloc_flags; + u64 calc; + + spin_lock(&shrink_block_group->lock); + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + spin_unlock(&shrink_block_group->lock); + + trans = btrfs_start_transaction(root, 1); + spin_lock(&shrink_block_group->lock); + + new_alloc_flags = update_block_group_flags(root, + shrink_block_group->flags); + if (new_alloc_flags != shrink_block_group->flags) { + calc = + btrfs_block_group_used(&shrink_block_group->item); + } else { + calc = shrink_block_group->key.offset; + } + spin_unlock(&shrink_block_group->lock); + + do_chunk_alloc(trans, root->fs_info->extent_root, + calc + 2 * 1024 * 1024, new_alloc_flags, force); + + btrfs_end_transaction(trans, root); + } else + spin_unlock(&shrink_block_group->lock); + return 0; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key root_key; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + BUG_ON(is_bad_inode(inode)); + } else { + BUG_ON(1); + } + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + btrfs_end_transaction(trans, root); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct list_head list; + size_t offset; + int ret; + u64 disk_bytenr; + + INIT_LIST_HEAD(&list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} + +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct extent_buffer *leaf; + struct inode *reloc_inode; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + u64 skipped; + u64 cur_byte; + u64 total_found; + u32 nritems; + int ret; + int progress; + int pass = 0; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(info, group_start); + BUG_ON(!block_group); + + printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->flags); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + reloc_inode = create_reloc_inode(info, block_group); + BUG_ON(IS_ERR(reloc_inode)); + + __alloc_chunk_for_shrink(root, block_group, 1); + set_block_group_readonly(block_group); + + btrfs_start_delalloc_inodes(info->tree_root); + btrfs_wait_ordered_extents(info->tree_root, 0); +again: + skipped = 0; + total_found = 0; + progress = 0; + key.objectid = block_group->key.objectid; + key.offset = 0; + key.type = 0; + cur_byte = key.objectid; + + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(info->tree_root); + btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); + mutex_unlock(&root->fs_info->cleaner_mutex); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; +next: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + break; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (progress && need_resched()) { + btrfs_release_path(root, path); + cond_resched(); + progress = 0; + continue; + } + progress = 1; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= cur_byte) { + path->slots[0]++; + goto next; + } + + total_found++; + cur_byte = key.objectid + key.offset; + btrfs_release_path(root, path); + + __alloc_chunk_for_shrink(root, block_group, 0); + ret = relocate_one_extent(root, path, &key, block_group, + reloc_inode, pass); + BUG_ON(ret < 0); + if (ret > 0) + skipped++; + + key.objectid = cur_byte; + key.type = 0; + key.offset = 0; + } + + btrfs_release_path(root, path); + + if (pass == 0) { + btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); + invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); + } + + if (total_found > 0) { + printk(KERN_INFO "btrfs found %llu extents in pass %d\n", + (unsigned long long)total_found, pass); + pass++; + if (total_found == skipped && pass > 2) { + iput(reloc_inode); + reloc_inode = create_reloc_inode(info, block_group); + pass = 0; + } + goto again; + } + + /* delete reloc_inode */ + iput(reloc_inode); + + /* unpin extents in this range */ + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + spin_lock(&block_group->lock); + WARN_ON(block_group->pinned > 0); + WARN_ON(block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&block_group->item) > 0); + spin_unlock(&block_group->lock); + put_block_group(block_group); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } + ret = -ENOENT; +out: + return ret; +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct rb_node *n; + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + WARN_ON(atomic_read(&block_group->count) != 1); + kfree(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + return 0; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; + } + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + break; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(root, path); + cache->flags = btrfs_block_group_flags(&cache->item); + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_readonly(cache); + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_new_blockgroup = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + int ret; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + memcpy(&key, &block_group->key, sizeof(key)); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + btrfs_remove_free_space_cache(block_group); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + spin_unlock(&block_group->space_info->lock); + block_group->space_info->full = 0; + + put_block_group(block_group); + put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 000000000000..e086d407f1fa --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3717 @@ +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/page-flags.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/version.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#ifdef LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + int extent_locked; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), 0, + NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->buffer.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +static void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, + u64 offset, struct rb_node *node) +{ + struct rb_root *root = &tree->buffer; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_buffer *eb; + + while (*p) { + parent = *p; + eb = rb_entry(parent, struct extent_buffer, rb_node); + + if (offset < eb->start) + p = &(*p)->rb_left; + else if (offset > eb->start) + p = &(*p)->rb_right; + else + return eb; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_root *root = &tree->buffer; + struct rb_node *n = root->rb_node; + struct extent_buffer *eb; + + while (n) { + eb = rb_entry(n, struct extent_buffer, rb_node); + if (offset < eb->start) + n = n->rb_left; + else if (offset > eb->start) + n = n->rb_right; + else + return eb; + } + return NULL; +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + tree->ops->set_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) { + tree->ops->clear_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + set_state_cb(tree, state, bits); + state->state |= bits; + state->start = start; + state->end = end; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->tree) { + clear_state_cb(tree, state, state->state); + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. + */ + + if (state->start < start) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + set_state_cb(tree, state, bits); + state->state |= bits; +} + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. + */ +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive, u64 *failed_start, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, bits); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} + +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, + 0, NULL, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +} + +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} + +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} + +static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} + +static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +} + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, mask); + return 0; + } + return 1; +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_pages, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (clear_unlock) + clear_bits |= EXTENT_LOCKED; + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + if (clear_delalloc) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); + if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + if (unlock_pages) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +#if 0 +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} + +/* + * helper function to unlock both pages and extents in the tree. + */ +static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +#endif + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + NULL); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, + GFP_ATOMIC); + } + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. + */ +static void end_bio_extent_preparewrite(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, page_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + u64 unlock_start; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + unsigned long nr_written = 0; + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + delalloc_start = delalloc_end + 1; + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + goto update_nr_written; + } + } + lock_extent(tree, start, page_end, GFP_NOFS); + + unlock_start = start; + + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + ret = 0; + goto update_nr_written; + } + } + + nr_written++; + + end = page_end; + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) + printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); + + if (last_byte <= start) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + unlock_start = page_end + 1; + goto done; + } + + set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + clear_extent_dirty(tree, cur, page_end, GFP_NOFS); + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + unlock_start = page_end + 1; + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + clear_extent_dirty(tree, cur, + cur + iosize - 1, GFP_NOFS); + + unlock_extent(tree, unlock_start, cur + iosize - 1, + GFP_NOFS); + + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + unlock_start = cur; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(WRITE, tree, page, sector, + iosize, pg_offset, bdev, + &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (unlock_start <= page_end) + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + +update_nr_written: + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || wbc->nr_to_write <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + if (epd->bio) { + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + unsigned long bio_flags = 0; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + 1, 1, GFP_NOFS); + return 0; +} + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_extent_mapped(page); + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + set_page_extent_mapped(page); + + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while (block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end - block_start + 1, 1); + if (IS_ERR(em) || !em) + goto err; + + cur_end = min(block_end, extent_map_end(em) - 1); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, + end_bio_extent_preparewrite, 0, + 0, 0); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS | EXTENT_ORDERED, 0)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + 1, 1, mask); + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + spin_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { + spin_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + spin_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + spin_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); + struct extent_map *em; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start > EXTENT_MAP_LAST_BYTE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb->start = start; + eb->len = len; + mutex_init(&eb->mutex); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#ifdef LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) { + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); + return eb; + } + spin_unlock(&tree->buffer_lock); + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + + spin_lock(&tree->buffer_lock); + exists = buffer_tree_insert(tree, start, &eb->rb_node); + if (exists) { + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + goto free_eb; + } + spin_unlock(&tree->buffer_lock); + + /* add one reference for the tree */ + atomic_inc(&eb->refs); + return eb; + +free_eb: + if (!atomic_dec_and_test(&eb->refs)) + return exists; + for (index = 1; index < i; index++) + page_cache_release(extent_buffer_page(eb, index)); + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + + if (eb) + mark_page_accessed(eb->first_page); + + return eb; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!set && !PageDirty(page)) + continue; + + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. + */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + eb->flags &= ~EXTENT_UPTODATE; + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + if (ret) + return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (eb->flags & EXTENT_UPTODATE) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + eb->flags |= EXTENT_UPTODATE; + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + eb->flags |= EXTENT_UPTODATE; + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + WARN_ON(!mutex_is_locked(&eb->mutex)); + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; + unsigned long i; + unsigned long num_pages; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (!eb) + goto out; + + if (atomic_read(&eb->refs) > 1) { + ret = 0; + goto out; + } + /* at this point we can safely release the extent buffer */ + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + page_cache_release(extent_buffer_page(eb, i)); + rb_erase(&eb->rb_node, &tree->buffer); + __free_extent_buffer(eb); +out: + spin_unlock(&tree->buffer_lock); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 000000000000..c5b483a79137 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,269 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include <linux/rbtree.h> + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_ORDERED (1 << 9) +#define EXTENT_ORDERED_METADATA (1 << 10) +#define EXTENT_BOUNDARY (1 << 11) +#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct rb_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + atomic_t refs; + int flags; + struct list_head leak_list; + struct rb_node rb_node; + struct mutex mutex; +}; + +struct extent_map_tree; + +static inline struct extent_state *extent_state_next(struct extent_state *state) +{ + struct rb_node *node; + node = rb_next(&state->rb_node); + if (!node) + return NULL; + return rb_entry(node, struct extent_state, rb_node); +} + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int release_extent_buffer_tail_pages(struct extent_buffer *eb); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_page, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 000000000000..4a83e33ada32 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,351 @@ +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/version.h> +#include <linux/hardirq.h> +#include "extent_map.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = btrfs_cache_create("extent_map", + sizeof(struct extent_map), 0, + NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +{ + tree->map.rb_node = NULL; + spin_lock_init(&tree->lock); +} +EXPORT_SYMBOL(extent_map_tree_init); + +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + em->flags = 0; + atomic_set(&em->refs, 1); + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} +EXPORT_SYMBOL(free_extent_map); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was sucessfull. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + free_extent_map(merge); + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 000000000000..fb6eeef06bb0 --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,62 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include <linux/rbtree.h> + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + int in_tree; +}; + +struct extent_map_tree { + struct rb_root map; + spinlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 000000000000..964652435fd1 --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,831 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + while (bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %lu start %llu\n", + inode->i_ino, + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(root, path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + BUG_ON(ret); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + BUG_ON(ret); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. + */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. + */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(root, path); + } +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. Grow it + */ + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + BUG_ON(ret); + goto csum; + } + +insert: + btrfs_release_path(root, path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; + cond_resched(); +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); + if (total_bytes < sums->len) { + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 000000000000..90268334145e --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/version.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + int write_bytes, + struct page **prepared_pages, + const char __user *buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ +static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + + lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + hint_byte = 0; + + set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); + + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. + */ + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + btrfs_update_inode(trans, root, inode); + } + err = btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(GFP_NOFS); + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + spin_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); + break; + } + if (start < em->start) { + len = em->start - start; + } else { + len = start + len - (em->start + em->len); + start = em->start + em->len; + } + free_extent_map(em); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = NULL; + } + spin_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +int btrfs_check_file(struct btrfs_root *root, struct inode *inode) +{ + return 0; +#if 0 + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + u64 last_offset = 0; + int nritems; + int slot; + int found_type; + int ret; + int err = 0; + u64 extent_end = 0; + + path = btrfs_alloc_path(); + ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, + last_offset, 0); + while (1) { + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + nritems = btrfs_header_nritems(path->nodes[0]); + } + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != inode->i_ino) + break; + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + if (found_key.offset < last_offset) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu " + "expected %llu\n", inode->i_ino, + (unsigned long long)found_key.offset, + (unsigned long long)last_offset); + err = 1; + goto out; + } + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + struct btrfs_item *item; + item = btrfs_item_nr(leaf, slot); + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, extent); + extent_end = (extent_end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + last_offset = extent_end; + path->slots[0]++; + } + if (0 && last_offset < inode->i_size) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu size %llu\n", + inode->i_ino, (unsigned long long)last_offset, + (unsigned long long)inode->i_size); + err = 1; + + } +out: + btrfs_free_path(path); + return err; +#endif +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. + */ +noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_byte) +{ + u64 extent_end = 0; + u64 locked_end = end; + u64 search_start = start; + u64 leaf_start; + u64 ram_bytes = 0; + u64 orig_parent = 0; + u64 disk_bytenr = 0; + u8 compression; + u8 encryption; + u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item old; + int keep; + int slot; + int bookend; + int found_type = 0; + int found_extent; + int found_inline; + int recow; + int ret; + + inline_limit = 0; + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + while (1) { + recow = 0; + btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } +next_slot: + keep = 0; + bookend = 0; + found_extent = 0; + found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; + compression = 0; + encryption = 0; + extent = NULL; + leaf = path->nodes[0]; + slot = path->slots[0]; + ret = 0; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && + key.offset >= end) { + goto out; + } + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != inode->i_ino) { + goto out; + } + if (recow) { + search_start = max(key.offset, start); + continue; + } + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (extent_end) + *hint_byte = extent_end; + + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); + found_extent = 1; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_inline = 1; + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, extent); + } + } else { + extent_end = search_start; + } + + /* we found nothing we can drop */ + if ((!found_extent && !found_inline) || + search_start >= extent_end) { + int nextret; + u32 nritems; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + recow = 1; + } else { + path->slots[0]++; + } + goto next_slot; + } + + if (end <= extent_end && start >= key.offset && found_inline) + *hint_byte = EXTENT_MAP_INLINE; + + if (found_extent) { + read_extent_buffer(leaf, &old, (unsigned long)extent, + sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; + } + + if (end < extent_end && end >= key.offset) { + bookend = 1; + if (found_inline && start <= key.offset) + keep = 1; + } + + if (bookend && found_extent) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + locked_end = extent_end; + continue; + } + locked_end = extent_end; + } + orig_parent = path->nodes[0]->start; + disk_bytenr = le64_to_cpu(old.disk_bytenr); + if (disk_bytenr != 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } + } + + if (found_inline) { + u64 mask = root->sectorsize - 1; + search_start = (extent_end + mask) & ~mask; + } else + search_start = extent_end; + + /* truncate existing extent */ + if (start > key.offset) { + u64 new_num; + u64 old_num; + keep = 1; + WARN_ON(start & (root->sectorsize - 1)); + if (found_extent) { + new_num = start - key.offset; + old_num = btrfs_file_extent_num_bytes(leaf, + extent); + *hint_byte = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (btrfs_file_extent_disk_bytenr(leaf, + extent)) { + inode_sub_bytes(inode, old_num - + new_num); + } + btrfs_set_file_extent_num_bytes(leaf, + extent, new_num); + btrfs_mark_buffer_dirty(leaf); + } else if (key.offset < inline_limit && + (end > extent_end) && + (inline_limit < extent_end)) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + inline_limit - key.offset); + inode_sub_bytes(inode, extent_end - + inline_limit); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) { + btrfs_truncate_item(trans, root, path, + new_size, 1); + } + } + } + /* delete the entire extent */ + if (!keep) { + if (found_inline) + inode_sub_bytes(inode, extent_end - + key.offset); + ret = btrfs_del_item(trans, root, path); + /* TODO update progress marker and return */ + BUG_ON(ret); + extent = NULL; + btrfs_release_path(root, path); + /* the extent will be freed later */ + } + if (bookend && found_inline && start <= key.offset) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + extent_end - end); + inode_sub_bytes(inode, end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) + ret = btrfs_truncate_item(trans, root, path, + new_size, 0); + BUG_ON(ret); + } + /* create bookend, splitting the extent in two */ + if (bookend && found_extent) { + struct btrfs_key ins; + ins.objectid = inode->i_ino; + ins.offset = end; + btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + write_extent_buffer(leaf, &old, + (unsigned long)extent, sizeof(old)); + + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); + btrfs_set_file_extent_offset(leaf, extent, + le64_to_cpu(old.offset) + end - key.offset); + WARN_ON(le64_to_cpu(old.num_bytes) < + (extent_end - end)); + btrfs_set_file_extent_num_bytes(leaf, extent, + extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); + btrfs_set_file_extent_type(leaf, extent, found_type); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } + btrfs_release_path(root, path); + if (disk_bytenr != 0) + inode_add_bytes(inode, extent_end - end); + } + + if (found_extent && !keep) { + u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + + if (old_disk_bytenr != 0) { + inode_sub_bytes(inode, + le64_to_cpu(old.num_bytes)); + ret = btrfs_free_extent(trans, root, + old_disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + leaf_start, root_owner, + root_gen, key.objectid, 0); + BUG_ON(ret); + *hint_byte = old_disk_bytenr; + } + } + + if (search_start >= end) { + ret = 0; + goto out; + } + } +out: + btrfs_free_path(path); + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_check_file(root, inode); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 extent_offset; + u64 other_start; + u64 other_end; + u64 split = start; + u64 locked_end = end; + u64 orig_parent; + int extent_type; + int split_end = 1; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + if (split == start) + key.offset = split; + else + key.offset = split - 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (key.offset == start) + split = end; + + if (key.offset == start && extent_end == end) { + int del_nr = 0; + int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + split_end = 0; + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + goto done; + } + + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + goto done; + } else if (split == start) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + locked_end = extent_end; + goto again; + } + locked_end = extent_end; + } + btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; + } else { + BUG_ON(key.offset != start); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + extent_end = split; + } + + if (extent_end == end) { + split_end = 0; + extent_type = BTRFS_FILE_EXTENT_REG; + } + if (extent_end == end && split == start) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]++; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - split); + goto done; + } + } + if (extent_end == end && split == end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - + other_start); + goto done; + } + } + + btrfs_mark_buffer_dirty(leaf); + + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + btrfs_release_path(root, path); + + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_compression(leaf, fi, 0); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } +done: + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + if (split_end && split == start) { + split = end; + goto again; + } + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_free_path(path); + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. + */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, start_pos); + if (err) + return err; + } + + memset(pages, 0, num_pages * sizeof(struct page *)); +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + BUG_ON(1); + } + wait_on_page_writeback(pages[i]); + } + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + loff_t start_pos; + ssize_t num_written = 0; + ssize_t err = 0; + int ret = 0; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + int nrptrs; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + int will_write; + + will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT)); + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); + pinned[0] = NULL; + pinned[1] = NULL; + + pos = *ppos; + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out_nolock; + if (count == 0) + goto out_nolock; + + err = file_remove_suid(file); + if (err) + goto out_nolock; + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + if (!PageUptodate(pinned[0])) { + ret = btrfs_readpage(NULL, pinned[0]); + BUG_ON(ret); + wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + if (!PageUptodate(pinned[1])) { + ret = btrfs_readpage(NULL, pinned[1]); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + while (count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, nrptrs * + (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + WARN_ON(num_pages > nrptrs); + memset(pages, 0, sizeof(struct page *) * nrptrs); + + ret = btrfs_check_free_space(root, write_bytes, 0); + if (ret) + goto out; + + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) + goto out; + + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + if (ret) { + btrfs_drop_pages(pages, num_pages); + goto out; + } + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + btrfs_drop_pages(pages, num_pages); + if (ret) + goto out; + + if (will_write) { + btrfs_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1, + WB_SYNC_NONE); + } else { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + num_pages); + if (num_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + cond_resched(); + } +out: + mutex_unlock(&inode->i_mutex); + +out_nolock: + kfree(pages); + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + + if (num_written > 0 && will_write) { + struct btrfs_trans_handle *trans; + + err = btrfs_wait_ordered_range(inode, start_pos, num_written); + if (err) + num_written = err; + + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, + file->f_dentry); + if (ret == 0) { + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); + } else { + btrfs_commit_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { + invalidate_mapping_pages(inode->i_mapping, + start_pos >> PAGE_CACHE_SHIFT, + (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); + } + } + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&root->fs_info->trans_mutex); + goto out; + } + mutex_unlock(&root->fs_info->trans_mutex); + + root->fs_info->tree_log_batch++; + filemap_fdatawrite(inode->i_mapping); + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->fs_info->tree_log_batch++; + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&file->f_dentry->d_inode->i_mutex); +out: + return ret > 0 ? EIO : ret; +} + +static struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &btrfs_file_vm_ops; + file_accessed(filp); + return 0; +} + +struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .write = btrfs_file_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 000000000000..d1e5f0e84c58 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) + p = &(*p)->rb_left; + else if (offset > info->offset) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +static int tree_insert_bytes(struct rb_root *root, u64 bytes, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, bytes_index); + + if (bytes < info->bytes) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. If contains is set we will return + * the free space that contains the given offset. If contains is not set we + * will return the free space that starts at or after the given offset and is + * at least bytes long. + */ +static struct btrfs_free_space *tree_search_offset(struct rb_root *root, + u64 offset, u64 bytes, + int contains) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + + if (offset < entry->offset) { + if (!contains && + (!ret || entry->offset < ret->offset) && + (bytes <= entry->bytes)) + ret = entry; + n = n->rb_left; + } else if (offset > entry->offset) { + if ((entry->offset + entry->bytes - 1) >= offset && + bytes <= entry->bytes) { + ret = entry; + break; + } + n = n->rb_right; + } else { + if (bytes > entry->bytes) { + n = n->rb_right; + continue; + } + ret = entry; + break; + } + } + + return ret; +} + +/* + * return a chunk at least bytes size, as close to offset that we can get. + */ +static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, + u64 offset, u64 bytes) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + + if (bytes < entry->bytes) { + /* + * We prefer to get a hole size as close to the size we + * are asking for so we don't take small slivers out of + * huge holes, but we also want to get as close to the + * offset as possible so we don't have a whole lot of + * fragmentation. + */ + if (offset <= entry->offset) { + if (!ret) + ret = entry; + else if (entry->bytes < ret->bytes) + ret = entry; + else if (entry->offset < ret->offset) + ret = entry; + } + n = n->rb_left; + } else if (bytes > entry->bytes) { + n = n->rb_right; + } else { + /* + * Ok we may have multiple chunks of the wanted size, + * so we don't want to take the first one we find, we + * want to take the one closest to our given offset, so + * keep searching just in case theres a better match. + */ + n = n->rb_right; + if (offset > entry->offset) + continue; + else if (!ret || entry->offset < ret->offset) + ret = entry; + } + } + + return ret; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + rb_erase(&info->bytes_index, &block_group->free_space_bytes); +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index); + if (ret) + return ret; + + ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, + &info->bytes_index); + if (ret) + return ret; + + return ret; +} + +static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info; + struct btrfs_free_space *left_info; + struct btrfs_free_space *info = NULL; + struct btrfs_free_space *alloc_info; + int ret = 0; + + alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!alloc_info) + return -ENOMEM; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(&block_group->free_space_offset, + offset+bytes, 0, 1); + left_info = tree_search_offset(&block_group->free_space_offset, + offset-1, 0, 1); + + if (right_info && right_info->offset == offset+bytes) { + unlink_free_space(block_group, right_info); + info = right_info; + info->offset = offset; + info->bytes += bytes; + } else if (right_info && right_info->offset != offset+bytes) { + printk(KERN_ERR "btrfs adding space in the middle of an " + "existing free space area. existing: " + "offset=%llu, bytes=%llu. new: offset=%llu, " + "bytes=%llu\n", (unsigned long long)right_info->offset, + (unsigned long long)right_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (left_info) { + unlink_free_space(block_group, left_info); + + if (unlikely((left_info->offset + left_info->bytes) != + offset)) { + printk(KERN_ERR "btrfs free space to the left " + "of new free space isn't " + "quite right. existing: offset=%llu, " + "bytes=%llu. new: offset=%llu, bytes=%llu\n", + (unsigned long long)left_info->offset, + (unsigned long long)left_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (info) { + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } else { + info = left_info; + info->bytes += bytes; + } + } + + if (info) { + ret = link_free_space(block_group, info); + if (!ret) + info = NULL; + goto out; + } + + info = alloc_info; + alloc_info = NULL; + info->offset = offset; + info->bytes = bytes; + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + if (ret) { + printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + if (ret == -EEXIST) + BUG(); + } + + kfree(alloc_info); + + return ret; +} + +static int +__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = tree_search_offset(&block_group->free_space_offset, offset, 0, + 1); + + if (info && info->offset == offset) { + if (info->bytes < bytes) { + printk(KERN_ERR "Found free space at %llu, size %llu," + "trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + unlink_free_space(block_group, info); + + if (info->bytes == bytes) { + kfree(info); + goto out; + } + + info->offset += bytes; + info->bytes -= bytes; + + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else if (info && info->offset < offset && + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(block_group, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kfree(info); + } + + /* step two, insert a new info struct to cover anything + * before the hole + */ + ret = __btrfs_add_free_space(block_group, old_start, + offset - old_start); + BUG_ON(ret); + } else { + WARN_ON(1); + } +out: + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret = 0; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_remove_free_space(block_group, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + + ret = __btrfs_remove_free_space(block_group, offset, bytes); + + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + } + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + mutex_lock(&block_group->alloc_mutex); + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, bytes_index); + unlink_free_space(block_group, info); + kfree(info); + if (need_resched()) { + mutex_unlock(&block_group->alloc_mutex); + cond_resched(); + mutex_lock(&block_group->alloc_mutex); + } + } + mutex_unlock(&block_group->alloc_mutex); +} + +#if 0 +static struct btrfs_free_space *btrfs_find_free_space_offset(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +static struct btrfs_free_space *btrfs_find_free_space_bytes(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + + ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} +#endif + +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret = NULL; + + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + if (!ret) + ret = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes); + + return ret; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 000000000000..2a020b276768 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +#include "crc32c.h" +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 000000000000..3d46fa1f29a4 --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + BUG_ON(ret); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + if (ret == 0 && objectid > root->highest_inode) + root->highest_inode = objectid; + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 000000000000..2aa79873eb46 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = found_key.objectid; + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * walks the btree of allocated inodes and find a hole. + */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + int slot = 0; + u64 last_ino = 0; + int start_found; + struct extent_buffer *l; + struct btrfs_key search_key; + u64 search_start = dirid; + + mutex_lock(&root->objectid_mutex); + if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && + root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { + *objectid = ++root->last_inode_alloc; + mutex_unlock(&root->objectid_mutex); + return 0; + } + path = btrfs_alloc_path(); + BUG_ON(!path); + search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_key.objectid = search_start; + search_key.type = 0; + search_key.offset = 0; + + btrfs_init_path(path); + start_found = 0; + ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + if (!start_found) { + *objectid = search_start; + start_found = 1; + goto found; + } + *objectid = last_ino > search_start ? + last_ino : search_start; + goto found; + } + btrfs_item_key_to_cpu(l, &key, slot); + if (key.objectid >= search_start) { + if (start_found) { + if (last_ino < search_start) + last_ino = search_start; + if (key.objectid > last_ino) { + *objectid = last_ino; + goto found; + } + } else if (key.objectid > search_start) { + *objectid = search_start; + goto found; + } + } + if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + break; + + start_found = 1; + last_ino = key.objectid + 1; + path->slots[0]++; + } + BUG_ON(1); +found: + btrfs_release_path(root, path); + btrfs_free_path(path); + BUG_ON(*objectid < search_start); + mutex_unlock(&root->objectid_mutex); + return 0; +error: + btrfs_release_path(root, path); + btrfs_free_path(path); + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 000000000000..8adfe059ab41 --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,5035 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/falloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "ref-cache.h" +#include "compression.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static struct inode_operations btrfs_dir_inode_operations; +static struct inode_operations btrfs_symlink_inode_operations; +static struct inode_operations btrfs_dir_ro_inode_operations; +static struct inode_operations btrfs_special_inode_operations; +static struct inode_operations btrfs_file_inode_operations; +static struct address_space_operations btrfs_aops; +static struct address_space_operations btrfs_symlink_aops; +static struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_bit_radix_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del) +{ + u64 total; + u64 used; + u64 thresh; + int ret = 0; + + spin_lock(&root->fs_info->delalloc_lock); + total = btrfs_super_total_bytes(&root->fs_info->super_copy); + used = btrfs_super_bytes_used(&root->fs_info->super_copy); + if (for_del) + thresh = total * 90; + else + thresh = total * 85; + + do_div(thresh, 100); + + if (used + root->fs_info->delalloc_bytes + num_required > thresh) + ret = -ENOSPC; + spin_unlock(&root->fs_info->delalloc_lock); + return ret; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, start, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. + */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 orig_start; + u64 disk_num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + + orig_start = start; + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + btrfs_end_transaction(trans, root); + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 0, + 0, 1, 1, 1); + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret); + + if (start + num_bytes < end && start + num_bytes < actual_end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret; + + if (list_empty(&async_cow->extents)) + return 0; + + trans = btrfs_join_transaction(root, 1); + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent(inode, async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + + btrfs_end_transaction(trans, root); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, 1, 1, 0, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + actual_end = min_t(u64, isize, end + 1); + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 1, + 1, 1, 1, 1); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; + + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, unlock, 1, + 1, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | + EXTENT_DELALLOC, 1, 0, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (btrfs_test_flag(inode, NOCOMPRESS)) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == inode->i_ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > inode->i_ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + disk_bytenr)) + goto out_check; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(root, path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(GFP_NOFS); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, 1, 1, 1, 0, 0, 0); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(root, path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_free_path(path); + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + if (btrfs_test_flag(inode, NODATACOW)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (btrfs_test_flag(inode, PREALLOC)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + + return ret; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */ +static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; + root->fs_info->delalloc_bytes += end - start + 1; + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&root->fs_info->delalloc_lock); + if (end - start + 1 > root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", + (unsigned long long)end - start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + root->fs_info->delalloc_bytes = 0; + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + root->fs_info->delalloc_bytes -= end - start + 1; + BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = btrfs_test_flag(inode, NODATASUM); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) + btrfs_lookup_bio_sums(root, inode, bio, NULL); + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + btrfs_set_trans_block_group(trans, inode); + list_for_each(cur, list) { + sum = list_entry(cur, struct btrfs_ordered_sum, list); + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ + if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_ORDERED, 0)) { + goto out; + } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, + page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ClearPageChecked(page); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_ORDERED, 0); + if (ret) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, file_pos, &hint); + BUG_ON(ret); + + ins.objectid = inode->i_ino; + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int compressed = 0; + int ret; + + ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + if (!ret) + return 0; + + trans = btrfs_join_transaction(root, 1); + + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) + goto nocow; + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); + ret = btrfs_mark_extent_written(trans, root, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); +nocow: + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + btrfs_ordered_update_i_size(inode, ordered_extent); + btrfs_update_inode(trans, root, inode); + btrfs_remove_ordered_extent(inode, ordered_extent); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & (1 << BIO_RW)) + rw = WRITE; + else + rw = READ; + + BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->list_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->list_lock); + return 0; + } + + spin_unlock(&root->list_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. + */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + path = btrfs_alloc_path(); + if (!path) + return; + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) + break; + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->list_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->list_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 1); + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); +} + +/* + * read an inode from the btree into the in-memory inode + */ +void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + u64 alloc_group_block; + u32 rdev; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, + alloc_group_block, 0); + btrfs_free_path(path); + inode_item = NULL; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + init_special_inode(inode, inode->i_mode, rdev); + break; + } + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, + inode->i_ino, + dir->i_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %lu parent %lu\n", name_len, name, + inode->i_ino, dir->i_ino); + goto err; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; +out: + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + root = BTRFS_I(dir)->root; + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + + nr = trans->blocks_used; + + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + /* + * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir + * the root of a subvolume or snapshot + */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + return -ENOTEMPTY; + } + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_orphan_add(trans, inode); + if (err) + goto fail_trans; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); + +fail_trans: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + + if (ret && !err) + err = ret; + return err; +} + +#if 0 +/* + * when truncating bytes in a file, it is possible to avoid reading + * the leaves that contain only checksum items. This can be the + * majority of the IO required to delete a large file, but it must + * be done carefully. + * + * The keys in the level just above the leaves are checked to make sure + * the lowest key in a given leaf is a csum key, and starts at an offset + * after the new size. + * + * Then the key for the next leaf is checked to make sure it also has + * a checksum item for the same file. If it does, we know our target leaf + * contains only checksum items, and it can be safely freed without reading + * it. + * + * This is just an optimization targeted at large files. It may do + * nothing. It will return 0 unless things went badly. + */ +static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *inode, u64 new_size) +{ + struct btrfs_key key; + int ret; + int nritems; + struct btrfs_key found_key; + struct btrfs_key other_key; + struct btrfs_leaf_ref *ref; + u64 leaf_gen; + u64 leaf_start; + + path->lowest_level = 1; + key.objectid = inode->i_ino; + key.type = BTRFS_CSUM_ITEM_KEY; + key.offset = new_size; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (path->nodes[1] == NULL) { + ret = 0; + goto out; + } + ret = 0; + btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); + nritems = btrfs_header_nritems(path->nodes[1]); + + if (!nritems) + goto out; + + if (path->slots[1] >= nritems) + goto next_node; + + /* did we find a key greater than anything we want to delete? */ + if (found_key.objectid > inode->i_ino || + (found_key.objectid == inode->i_ino && found_key.type > key.type)) + goto out; + + /* we check the next key in the node to make sure the leave contains + * only checksum items. This comparison doesn't work if our + * leaf is the last one in the node + */ + if (path->slots[1] + 1 >= nritems) { +next_node: + /* search forward from the last key in the node, this + * will bring us into the next node in the tree + */ + btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); + + /* unlikely, but we inc below, so check to be safe */ + if (found_key.offset == (u64)-1) + goto out; + + /* search_forward needs a path with locks held, do the + * search again for the original key. It is possible + * this will race with a balance and return a path that + * we could modify, but this drop is just an optimization + * and is allowed to miss some leaves. + */ + btrfs_release_path(root, path); + found_key.offset++; + + /* setup a max key for search_forward */ + other_key.offset = (u64)-1; + other_key.type = key.type; + other_key.objectid = key.objectid; + + path->keep_locks = 1; + ret = btrfs_search_forward(root, &found_key, &other_key, + path, 0, 0); + path->keep_locks = 0; + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + key.offset = found_key.offset; + btrfs_release_path(root, path); + cond_resched(); + goto again; + } + + /* we know there's one more slot after us in the tree, + * read that key so we can verify it is also a checksum item + */ + btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); + + if (found_key.objectid < inode->i_ino) + goto next_key; + + if (found_key.type != key.type || found_key.offset < new_size) + goto next_key; + + /* + * if the key for the next leaf isn't a csum key from this objectid, + * we can't be sure there aren't good items inside this leaf. + * Bail out + */ + if (other_key.objectid != inode->i_ino || other_key.type != key.type) + goto out; + + leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); + leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); + /* + * it is safe to delete this leaf, it contains only + * csum items from this inode at an offset >= new_size + */ + ret = btrfs_del_leaf(trans, root, path, leaf_start); + BUG_ON(ret); + + if (root->ref_cows && leaf_gen < trans->transid) { + ref = btrfs_alloc_leaf_ref(root, 0); + if (ref) { + ref->root_gen = root->root_key.offset; + ref->bytenr = leaf_start; + ref->owner = 0; + ref->generation = leaf_gen; + ref->nritems = 0; + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } else { + WARN_ON(1); + } + } +next_key: + btrfs_release_path(root, path); + + if (other_key.objectid == inode->i_ino && + other_key.type == key.type && other_key.offset > key.offset) { + key.offset = other_key.offset; + cond_resched(); + goto again; + } + ret = 0; +out: + /* fixup any changes we've made to the path */ + path->lowest_level = 0; + path->keep_locks = 0; + btrfs_release_path(root, path); + return ret; +} + +#endif + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + u32 found_type; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + u64 mask = root->sectorsize - 1; + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); + path->reada = -1; + BUG_ON(!path); + + /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; + + btrfs_init_path(path); + +search_again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto error; + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) { + ret = 0; + goto error; + } + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != inode->i_ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (item_end < new_size) { + if (found_type == BTRFS_DIR_ITEM_KEY) + found_type = BTRFS_INODE_ITEM_KEY; + else if (found_type == BTRFS_EXTENT_ITEM_KEY) + found_type = BTRFS_EXTENT_DATA_KEY; + else if (found_type == BTRFS_EXTENT_DATA_KEY) + found_type = BTRFS_XATTR_ITEM_KEY; + else if (found_type == BTRFS_XATTR_ITEM_KEY) + found_type = BTRFS_INODE_REF_KEY; + else if (found_type) + found_type--; + else + break; + btrfs_set_key_type(&key, found_type); + goto next; + } + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + found_extent = 0; + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + BUG_ON(ret); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent) { + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); + BUG_ON(ret); + } +next: + if (path->slots[0] == 0) { + if (pending_del_nr) + goto del_pending; + btrfs_release_path(root, path); + goto search_again; + } + + path->slots[0]--; + if (pending_del_nr && + path->slots[0] + 1 != pending_del_slot) { + struct btrfs_key debug; +del_pending: + btrfs_item_key_to_cpu(path->nodes[0], &debug, + pending_del_slot); + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + btrfs_release_path(root, path); + goto search_again; + } + } + ret = 0; +error: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); + inode->i_sb->s_dirt = 1; + return ret; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. + */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +int btrfs_cont_expand(struct inode *inode, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 mask = root->sectorsize - 1; + u64 hole_start = (inode->i_size + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err; + + if (size <= hole_start) + return 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + return err; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + err = btrfs_drop_extents(trans, root, inode, + cur_offset, + cur_offset + hole_size, + cur_offset, &hint_byte); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + } + free_extent_map(em); + cur_offset = last_byte; + if (err || cur_offset >= block_end) + break; + } + + btrfs_end_transaction(trans, root); + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } + + err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); + return err; +} + +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete_lock; + } + + btrfs_orphan_del(trans, inode); + + nr = trans->blocks_used; + clear_inode(inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return; + +no_delete_lock: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + clear_inode(inode); +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (!di || IS_ERR(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct btrfs_key *location, + struct btrfs_root **sub_root, + struct dentry *dentry) +{ + struct btrfs_root_item *ri; + + if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) + return 0; + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return 0; + + *sub_root = btrfs_read_fs_root(root->fs_info, location, + dentry->d_name.name, + dentry->d_name.len); + if (IS_ERR(*sub_root)) + return PTR_ERR(*sub_root); + + ri = &(*sub_root)->root_item; + location->objectid = btrfs_root_dirid(ri); + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->offset = 0; + + return 0; +} + +static noinline void init_btrfs_i(struct inode *inode) +{ + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->i_acl = NULL; + bi->i_default_acl = NULL; + + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; + bi->log_dirty_trans = 0; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, + inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, + inode->i_mapping, GFP_NOFS); + INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); + mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == inode->i_ino && + args->root == BTRFS_I(inode)->root; +} + +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. + * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-EACCES); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; + } + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_inode *bi = BTRFS_I(dir); + struct btrfs_root *root = bi->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int ret, new; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + inode = NULL; + if (location.objectid) { + ret = fixup_tree_root_location(root, &location, &sub_root, + dentry); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return inode; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +static unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + int advance; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." */ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + 1, inode->i_ino, + DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + 2, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + path->reada = 2; + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = inode->i_ino; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (advance || slot >= nritems) { + if (slot >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + slot++; + path->slots[0]++; + } + } + + advance = 1; + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + continue; + + filp->f_pos = found_key.offset; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + else + filp->f_pos++; +nopos: + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, int wait) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (root->fs_info->btree_inode == inode) + return 0; + + if (wait) { + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +void btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != inode->i_ino || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + u64 alloc_hint, int mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (dir) { + ret = btrfs_set_inode_index(dir, index); + if (ret) + return ERR_PTR(ret); + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + init_btrfs_i(inode); + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + BTRFS_I(inode)->block_group = + btrfs_find_block_group(root, 0, alloc_hint, owner); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + btrfs_set_flag(inode, NODATASUM); + if (btrfs_test_opt(root, NODATACOW)) + btrfs_set_flag(inode, NODATACOW); + } + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + if (objectid > root->highest_inode) + root->highest_inode = objectid; + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + inode->i_ino = objectid; + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + insert_inode_hash(inode); + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, + &key, btrfs_inode_type(inode), + index); + if (ret == 0) { + if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, + inode->i_ino, + parent_inode->i_ino, + index); + } + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct dentry *dentry, struct inode *inode, + int backref, u64 index) +{ + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, + objectid, BTRFS_I(dir)->block_group, mode, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + if (inode->i_nlink == 0) + return -ENOENT; + + btrfs_inc_nlink(inode); + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + atomic_inc(&inode->i_count); + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + + if (err) + drop_inode = 1; + + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + + if (err) + drop_inode = 1; + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_unlock; + } + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFDIR | mode, + &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_acl(inode, dir); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + btrfs_set_trans_block_group(trans, inode); + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + +out_unlock: + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compressed; + +again: + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + BUG_ON(!path); + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + btrfs_release_path(root, path); + trans = btrfs_join_transaction(root, 1); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(root, path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + spin_unlock(&em_tree->lock); +out: + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + WARN_ON(1); + return ERR_PTR(err); + } + return em; +} + +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + return -EINVAL; +} + +static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +{ + return extent_bmap(mapping, iblock, btrfs_get_extent); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED, 1, 0, GFP_NOFS); + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_ORDERED, + 1, 1, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + if (ret) + goto out; + + ret = -EINVAL; +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); +out: + return ret; +} + +static void btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + if (!S_ISREG(inode->i_mode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_i_size_write(inode, inode->i_size); + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + /* FIXME, add redo link to tree so we don't leak on crash */ + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + btrfs_update_inode(trans, root, inode); + + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + +out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint) +{ + struct inode *inode; + int error; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, alloc_hint, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + error = btrfs_update_inode(trans, new_root, inode); + if (error) + return error; + + d_instantiate(dentry, inode); + return 0; +} + +/* helper function for file defrag and space balancing. This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->last_trans = 0; + ei->logged_trans = 0; + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + ei->i_acl = BTRFS_ACL_NOT_CACHED; + ei->i_default_acl = BTRFS_ACL_NOT_CACHED; + INIT_LIST_HEAD(&ei->i_orphan); + return &ei->vfs_inode; +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + + if (BTRFS_I(inode)->i_acl && + BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_acl); + if (BTRFS_I(inode)->i_default_acl && + BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_default_acl); + + spin_lock(&BTRFS_I(inode)->root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" + " list\n", inode->i_ino); + dump_stack(); + } + spin_unlock(&BTRFS_I(inode)->root->list_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_bit_radix_cachep) + kmem_cache_destroy(btrfs_bit_radix_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); +} + +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *)) +{ + return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | extra_flags), ctor); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), + 0, init_once); + if (!btrfs_inode_cachep) + goto fail; + btrfs_trans_handle_cachep = + btrfs_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), + 0, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), + 0, NULL); + if (!btrfs_transaction_cachep) + goto fail; + btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), + 0, NULL); + if (!btrfs_path_cachep) + goto fail; + btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, + SLAB_DESTROY_BY_RCU, NULL); + if (!btrfs_bit_radix_cachep) + goto fail; + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + int ret; + + /* we're not allowed to rename between subvolumes */ + if (BTRFS_I(old_inode)->root->root_key.objectid != + BTRFS_I(new_dir)->root->root_key.objectid) + return -EXDEV; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + return -ENOTEMPTY; + } + + /* to rename a snapshot or subvolume, we need to juggle the + * backrefs. This isn't coded yet + */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -EXDEV; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, new_dir); + + btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (ret) + goto out_fail; + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + ret = btrfs_unlink_inode(trans, root, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (ret) + goto out_fail; + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + if (ret) + goto out_fail; + } + + } + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, + old_inode, new_dentry->d_name.name, + new_dentry->d_name.len, 1, index); + if (ret) + goto out_fail; + +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_unlock: + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +out_fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); + goto out; + } + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; + } +out: + if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; + btrfs_set_flag(inode, PREALLOC); + if (!(mode & FALLOC_FL_KEEP_SIZE) && + cur_offset > i_size_read(inode)) + btrfs_i_size_write(inode, cur_offset); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans, root); + return ret; +} + +static long btrfs_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + mutex_lock(&inode->i_mutex); + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, + alloc_end - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE) { + ret = prealloc_file_range(inode, cur_offset, + last_byte, alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; + } + } + if (em->block_start <= EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, btrfs_check_acl); +} + +static struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, +}; + +static struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .sync_page = block_sync_page, + .bmap = btrfs_bmap, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, +}; + +static struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fallocate = btrfs_fallocate, +}; +static struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = btrfs_permission, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 000000000000..c2aa33e3feb5 --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,1132 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/security.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" + + + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + unsigned long nr = 1; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, BTRFS_ROOT_BACKREF_KEY, + root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + root->root_key.objectid, BTRFS_ROOT_REF_KEY, + objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret = 0; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, char *name, + int mode, int namelen, + struct btrfs_root *snap_src) +{ + struct dentry *dentry; + int error; + + mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + if (!IS_POSIXACL(parent->dentry->d_inode)) + mode &= ~current->fs->umask; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(parent->dentry->d_inode, dentry); + if (error) + goto out_drop_write; + + /* + * Actually perform the low-level subvolume creation after all + * this VFS fuzz. + * + * Eventually we want to pass in an inode under which we create this + * subvolume, but for now all are under the filesystem root. + * + * Also we should pass on the mode eventually to allow creating new + * subvolume with specific mode bits. + */ + if (snap_src) { + struct dentry *dir = dentry->d_parent; + struct dentry *test = dir->d_parent; + struct btrfs_path *path = btrfs_alloc_path(); + int ret; + u64 test_oid; + u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; + + test_oid = snap_src->root_key.objectid; + + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, parent_oid, test_oid); + if (ret == 0) + goto create; + btrfs_release_path(snap_src->fs_info->tree_root, path); + + /* we need to make sure we aren't creating a directory loop + * by taking a snapshot of something that has our current + * subvol in its directory tree. So, this loops through + * the dentries and checks the forward refs for each subvolume + * to see if is references the subvolume where we are + * placing this new snapshot. + */ + while (1) { + if (!test || + dir == snap_src->fs_info->sb->s_root || + test == snap_src->fs_info->sb->s_root || + test->d_inode->i_sb != snap_src->fs_info->sb) { + break; + } + if (S_ISLNK(test->d_inode->i_mode)) { + printk(KERN_INFO "Btrfs symlink in snapshot " + "path, failed\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + test_oid = + BTRFS_I(test->d_inode)->root->root_key.objectid; + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, test_oid, parent_oid); + if (ret == 0) { + printk(KERN_INFO "Btrfs snapshot creation " + "failed, looping\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + btrfs_release_path(snap_src->fs_info->tree_root, path); + test = test->d_parent; + } +create: + btrfs_free_path(path); + error = create_snapshot(snap_src, dentry, name, namelen); + } else { + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, + dentry, name, namelen); + } + if (error) + goto out_drop_write; + + fsnotify_mkdir(parent->dentry->d_inode, dentry); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&parent->dentry->d_inode->i_mutex); + return error; +} + + +static int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + ret = btrfs_check_free_space(root, inode->i_size, 0); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; +again: + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct file *src_file; + u64 root_dirid; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, BTRFS_I(src_inode)->root); + fput(src_file); + } + +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + btrfs_defrag_file(file); + break; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, off+len); + } + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + + /* clone data */ + key.objectid = src->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. + */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(root, path); + + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + if (type == BTRFS_FILE_EXTENT_REG) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + if (key.offset + datao + datal + key.offset > + off + len) + datal = off + len - key.offset - datao; + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + } + +next: + btrfs_release_path(root, path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (destoff + olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + trans = btrfs_start_ioctl_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = NULL; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); + +out: + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..78049ea208db --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include <linux/ioctl.h> + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 3072 + +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 000000000000..39bae7761db6 --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <asm/bug.h> +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +/* + * locks the per buffer mutex in an extent buffer. This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree + * searches, and we should give up if they are in more expensive code. + */ + +int btrfs_tree_lock(struct extent_buffer *eb) +{ + int i; + + if (mutex_trylock(&eb->mutex)) + return 0; + for (i = 0; i < 512; i++) { + cpu_relax(); + if (mutex_trylock(&eb->mutex)) + return 0; + } + cpu_relax(); + mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); + return 0; +} + +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + return mutex_trylock(&eb->mutex); +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + mutex_unlock(&eb->mutex); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return mutex_is_locked(&eb->mutex); +} + +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ +int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +{ + int i; + struct extent_buffer *eb; + for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { + eb = path->nodes[i]; + if (!eb) + break; + smp_mb(); + if (!list_empty(&eb->mutex.wait_list)) + return 1; + } + return 0; +} + diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 000000000000..bc1faef12519 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 000000000000..a20940170274 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,730 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + mutex_lock(&tree->mutex); + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + + set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, + entry_end(entry) - 1, GFP_NOFS); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + BUG_ON(node); + return 0; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + list_add_tail(&sum->list, &entry->list); + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, + GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + ret = test_range_bit(io_tree, entry->file_offset, + entry->file_offset + entry->len - 1, + EXTENT_ORDERED, 0); + if (ret == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +out: + mutex_unlock(&tree->mutex); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); + return 0; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + + btrfs_wait_on_page_writeback_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + struct rb_node *node; + struct btrfs_ordered_extent *test; + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size >= inode->i_size || + ordered->file_offset + ordered->len <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, + EXTENT_DELALLOC, 0)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + node = &ordered->rb_node; + while (1) { + node = rb_prev(node); + if (!node) + break; + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= inode->i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + } + new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + node = rb_next(&ordered->rb_node); + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > entry_end(ordered)) + i_size_test = test->file_offset; + } else { + i_size_test = i_size_read(inode); + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, + EXTENT_DELALLOC, 0)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +out: + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct list_head *cur; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_prev(cur, &ordered->list) { + ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/** + * taken from mm/filemap.c because it isn't exported + * + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. + */ +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, + .for_writepages = 1, + }; + return btrfs_writepages(mapping, &wbc); +} + +/** + * taken from mm/filemap.c because it isn't exported + * + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 000000000000..ab66d5e8d6d6 --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + struct mutex mutex; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* flags (described above) */ + unsigned long flags; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + mutex_init(&t->mutex); + t->tree.rb_node = NULL; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int tyep); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 000000000000..3c0d52af4f80 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 000000000000..5f8f218c1005 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_extent_item *ei; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); + break; + + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + btrfs_header_level(c), nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + btrfs_header_level(c) != 1) + BUG(); + if (btrfs_header_level(next) != + btrfs_header_level(c) - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 000000000000..da75efe534d5 --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 000000000000..6f0acc4c9eab --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents) +{ + struct btrfs_leaf_ref *ref; + size_t size = btrfs_leaf_ref_size(nr_extents); + + ref = kmalloc(size, GFP_NOFS); + if (ref) { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + INIT_LIST_HEAD(&ref->list); + } + return ref; +} + +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + size_t size = btrfs_leaf_ref_size(ref->nritems); + + BUG_ON(ref->in_tree); + kfree(ref); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared) +{ + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + if (!tree) + return 0; + + spin_lock(&tree->lock); + while (!list_empty(&tree->list)) { + ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); + BUG_ON(ref->tree != tree); + if (ref->root_gen > max_root_gen) + break; + if (!xchg(&ref->in_tree, 0)) { + cond_resched_lock(&tree->lock); + continue; + } + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + btrfs_free_leaf_ref(root, ref); + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; +again: + if (tree) { + spin_lock(&tree->lock); + rb = tree_search(&tree->root, bytenr); + if (rb) + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + if (ref) + return ref; + } + if (tree != &root->fs_info->shared_ref_tree) { + tree = &root->fs_info->shared_ref_tree; + goto again; + } + return NULL; +} + +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared) +{ + int ret = 0; + struct rb_node *rb; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + atomic_inc(&ref->usage); + ref->tree = tree; + ref->in_tree = 1; + list_add_tail(&ref->list, &tree->list); + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + struct btrfs_leaf_ref_tree *tree; + + if (!xchg(&ref->in_tree, 0)) + return 0; + + tree = ref->tree; + spin_lock(&tree->lock); + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(root, ref); + return 0; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 000000000000..16f3183d7c59 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + INIT_LIST_HEAD(&tree->list); + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared); +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 000000000000..b48650de4472 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. + */ +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + root = root->fs_info->tree_root; + search_key.objectid = search_start; + search_key.type = (u8)-1; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + ret = 1; + goto out; + } + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); + if (search_key.type != BTRFS_ROOT_ITEM_KEY) { + search_key.offset++; + btrfs_release_path(root, path); + goto again; + } + ret = 0; + *found_objectid = search_key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + l = path->nodes[0]; + BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid) { + ret = 1; + goto out; + } + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. + */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) +{ + struct btrfs_root *dead_root; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(root, path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + u32 refs; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + refs = btrfs_disk_root_refs(leaf, ri); + BUG_ON(refs != 0); + ret = btrfs_del_item(trans, root, path); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +#if 0 /* this will get used when snapshot deletion is implemented */ +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} +#endif + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 000000000000..c0f7ecaf1e79 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/highmem.h> + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. + */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 000000000000..0a14b495532f --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,722 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/parser.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/miscdevice.h> +#include <linux/version.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define BTRFS_SUPER_MAGIC 0x9123683E + +static struct super_operations btrfs_super_ops; + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; +} + +enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_extent, "max_extent=%s"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_ssd, "ssd"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL}, +}; + +u64 btrfs_parse_size(char *str) +{ + u64 res; + int mult = 1; + char *end; + char last; + + res = simple_strtoul(str, &end, 10); + + last = end[0]; + if (isalpha(last)) { + last = tolower(last); + switch (last) { + case 'g': + mult *= 1024; + case 'm': + mult *= 1024; + case 'k': + mult *= 1024; + } + res = res * mult; + } + return res; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatacsum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_extent: + num = match_strdup(&args[0]); + if (num) { + info->max_extent = btrfs_parse_size(num); + kfree(num); + + info->max_extent = max_t(u64, + info->max_extent, root->sectorsize); + printk(KERN_INFO "btrfs: max_extent at %llu\n", + info->max_extent); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = btrfs_parse_size(num); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = btrfs_parse_size(num); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + default: + break; + } + } + kfree(options); + return 0; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *p; + int error = 0; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(opts); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_inode *bi; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_POSIXACL; + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); + + if (!inode) { + err = -ENOMEM; + goto fail_close; + } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } +#if 0 + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; +#endif + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + int ret; + root = btrfs_sb(sb); + + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + btrfs_clean_old_snapshots(root); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; + return ret; +} + +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *root = btrfs_sb(s); + + return root->fs_info->fs_devices == test_fs_devices; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static int btrfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + char *subvol_name = NULL; + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + fmode_t mode = FMODE_READ; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &fs_devices); + if (error) + return error; + + error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + up_write(&s->s_umount); + deactivate_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + if (!strcmp(subvol_name, ".")) + root = dget(s->s_root); + else { + mutex_lock(&s->s_root->d_inode->i_mutex); + root = lookup_one_len(subvol_name, s->s_root, + strlen(subvol_name)); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + if (IS_ERR(root)) { + up_write(&s->s_umount); + deactivate_super(s); + error = PTR_ERR(root); + goto error_free_subvol_name; + } + if (!root->d_inode) { + dput(root); + up_write(&s->s_umount); + deactivate_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + } + + mnt->mnt_sb = s; + mnt->mnt_root = root; + + kfree(subvol_name); + return 0; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); +error_free_subvol_name: + kfree(subvol_name); + return error; +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_reloc_trees(root); + WARN_ON(ret); + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + + buf->f_namelen = BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - + (btrfs_super_bytes_used(disk_super) >> bits); + buf->f_bavail = buf->f_bfree; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = 0; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { + ret = -EFAULT; + goto out; + } + len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } +out: + kfree(vol); + return ret; +} + +static int btrfs_freeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); + return 0; +} + +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + return 0; +} + +static struct super_operations btrfs_super_ops = { + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .write_super = btrfs_write_super, + .sync_fs = btrfs_sync_fs, + .show_options = generic_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice btrfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_cachep(); + if (err) + goto free_sysfs; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_interface_init(); + if (err) + goto free_extent_map; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 000000000000..a240b6fa81df --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + +#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ + show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ + show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + char *name; + char c; + int len = strlen(fs->sb->s_id) + 1; + int i; + + name = kmalloc(len, GFP_NOFS); + if (!name) { + error = -ENOMEM; + goto fail; + } + + for (i = 0; i < len; i++) { + c = fs->sb->s_id[i]; + if (c == '/' || c == '\\') + c = '!'; + name[i] = c; + } + name[len] = '\0'; + + fs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, + NULL, "%s", name); + kfree(name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, + &root->fs_info->super_kobj, + "%s", root->name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_put(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_put(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 000000000000..8a08f9443340 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1097 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(transaction->use_count == 0); + transaction->use_count--; + if (transaction->use_count == 0) { + list_del_init(&transaction->list); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, + GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->generation++; + root->fs_info->last_alloc = 0; + root->fs_info->last_data_alloc = 0; + cur_trans->num_writers = 1; + cur_trans->num_joined = 0; + cur_trans->transid = root->fs_info->generation; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + cur_trans->use_count = 1; + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->new_trans_lock); + } else { + cur_trans->num_writers++; + cur_trans->num_joined++; + } + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +{ + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { + WARN_ON(root == root->fs_info->extent_root); + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); + + root->commit_root = btrfs_root_node(root); + + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; + + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + cur_trans->use_count++; + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (cur_trans->blocked) { + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } else { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + } + put_transaction(cur_trans); + } +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + int num_blocks, int wait) +{ + struct btrfs_trans_handle *h = + kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->log_root_recovering && + ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + wait_current_trans(root); + ret = join_transaction(root); + BUG_ON(ret); + + btrfs_record_root_in_trans(root); + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + h->block_group = 0; + h->alloc_exclude_nr = 0; + h->alloc_exclude_start = 0; + root->fs_info->running_transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 1); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks) +{ + return start_transaction(r, num_blocks, 2); +} + +/* wait for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + mutex_lock(&root->fs_info->trans_mutex); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + mutex_unlock(&root->fs_info->trans_mutex); + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +/* + * rate limit against the drop_snapshot code. This helps to slow down new + * operations if the drop_snapshot code isn't able to keep up. + */ +static void throttle_on_drops(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + int harder_count = 0; + +harder: + if (atomic_read(&info->throttles)) { + DEFINE_WAIT(wait); + int thr; + thr = atomic_read(&info->throttle_gen); + + do { + prepare_to_wait(&info->transaction_throttle, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&info->throttles)) { + finish_wait(&info->transaction_throttle, &wait); + break; + } + schedule(); + finish_wait(&info->transaction_throttle, &wait); + } while (thr == atomic_read(&info->throttle_gen)); + harder_count++; + + if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && + harder_count < 2) + goto harder; + + if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && + harder_count < 10) + goto harder; + + if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && + harder_count < 20) + goto harder; + } +} + +void btrfs_throttle(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->open_ioctl_trans) + wait_current_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + + throttle_on_drops(root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *info = root->fs_info; + + mutex_lock(&info->trans_mutex); + cur_trans = info->running_transaction; + WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans->num_writers < 1); + cur_trans->num_writers--; + + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + throttle_on_drops(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + btrfs_extent_post_op(trans, root); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start) + break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); + + btrfs_extent_post_op(trans, root); + + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + } + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + + btrfs_extent_post_op(trans, fs_info->tree_root); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + btrfs_extent_post_op(trans, fs_info->tree_root); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +{ + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + + mutex_lock(&root->fs_info->trans_mutex); + list_add(&dirty->list, &latest->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_root *gang[8]; + struct btrfs_root *root; + int i; + int ret; + int err = 0; + u32 refs; + + while (1) { + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; + + btrfs_free_log(trans, root); + btrfs_free_reloc_root(trans, root); + + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); + + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); + + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, + &root->root_item); + if (err) + break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } + } + } + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + + smp_mb(); + if (root->defrag_running) + return 0; + trans = btrfs_start_transaction(root, 1); + while (1) { + root->defrag_running = 1; + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + trans = btrfs_start_transaction(root, 1); + if (root->fs_info->closing || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + smp_mb(); + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; + + while (!list_empty(list)) { + struct btrfs_root *root; + + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + trans = btrfs_start_transaction(tree_root, 1); + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); + + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + btrfs_record_root_in_trans(root); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } + + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); + break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 objectid; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); + if (ret) + goto fail; + + btrfs_record_root_in_trans(root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; + key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) + goto fail; + + key.offset = (u64)-1; + memcpy(&pending->root_key, &key, sizeof(key)); +fail: + kfree(new_root_item); + return ret; +} + +static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + int ret; + int namelen; + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; + struct inode *inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + trans = btrfs_join_transaction(parent_root, 1); + + /* + * insert the directory item + */ + namelen = strlen(pending->name); + ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_insert_dir_item(trans, parent_root, + pending->name, namelen, + parent_inode->i_ino, + &pending->root_key, BTRFS_FT_DIR, index); + + if (ret) + goto fail; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, + BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + parent_root->root_key.objectid, + BTRFS_ROOT_REF_KEY, + pending->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); +fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + struct list_head *cur; + int ret; + + list_for_each(cur, head) { + pending = list_entry(cur, struct btrfs_pending_snapshot, list); + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + while (!list_empty(head)) { + pending = list_entry(head->next, + struct btrfs_pending_snapshot, list); + ret = finish_pending_snapshot(fs_info, pending); + BUG_ON(ret); + list_del(&pending->list); + kfree(pending->name); + kfree(pending); + } + return 0; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; + struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + + INIT_LIST_HEAD(&dirty_fs_roots); + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; + } + + pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); + if (!pinned_copy) + return -ENOMEM; + + extent_io_tree_init(pinned_copy, + root->fs_info->btree_inode->i_mapping, GFP_NOFS); + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + cur_trans = trans->transaction; + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + prev_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, prev_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(prev_trans); + } + } + + do { + int snap_pending = 0; + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (cur_trans->num_writers > 1) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = 1; + + mutex_unlock(&root->fs_info->trans_mutex); + + if (snap_pending) { + ret = btrfs_wait_ordered_extents(root, 1); + BUG_ON(ret); + } + + schedule_timeout(timeout); + + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&cur_trans->writer_wait, &wait); + } while (cur_trans->num_writers > 1 || + (cur_trans->num_joined != joined)); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + + + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); + BUG_ON(ret); + + /* add_dirty_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = btrfs_commit_tree_roots(trans, root); + BUG_ON(ret); + + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + btrfs_copy_pinned(root, pinned_copy); + + trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); + wake_up(&root->fs_info->transaction_wait); + + mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root, pinned_copy); + kfree(pinned_copy); + + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); + + mutex_lock(&root->fs_info->trans_mutex); + + cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + + mutex_unlock(&root->fs_info->trans_mutex); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); + + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; + } + return 0; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 000000000000..ea292117f882 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" + +struct btrfs_transaction { + u64 transid; + unsigned long num_writers; + unsigned long num_joined; + int in_commit; + int use_count; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; +}; + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; + struct btrfs_transaction *transaction; + u64 block_group; + u64 alloc_exclude_start; + u64 alloc_exclude_nr; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + char *name; + struct btrfs_key root_key; + struct list_head list; +}; + +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + +static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + trans->block_group = BTRFS_I(inode)->block_group; +} + +static inline void btrfs_update_inode_block_group( + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->block_group = trans->block_group; +} + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +#endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 000000000000..3e8358c36165 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int orig_level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + orig_level = level; + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + WARN_ON(ret && ret != -EAGAIN); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } + + btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 000000000000..d81cda2e077c --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2898 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots. This must be called with a tree log transaction + * running (see start_log_trans). + */ +static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + int ret; + u64 objectid = root->root_key.objectid; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + return ret; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, 0); + + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, + &root_item); + if (ret) + goto fail; + + new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, + &key); + BUG_ON(!new_root); + + WARN_ON(root->log_root); + root->log_root = new_root; + + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + new_root->ref_cows = 0; + new_root->last_trans = trans->transid; + + /* + * we need to make sure the root block for this new tree + * is marked as dirty in the dirty_log_pages tree. This + * is how it gets flushed down to disk at tree log commit time. + * + * the tree logging mutex keeps others from coming in and changing + * the new_root->node, so we can safely access it here + */ + set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, + new_root->node->start + new_root->node->len - 1, + GFP_NOFS); + +fail: + return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + mutex_unlock(&root->fs_info->tree_log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->fs_info->tree_log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int end_log_trans(struct btrfs_root *root) +{ + atomic_dec(&root->fs_info->tree_log_writers); + smp_mb(); + if (waitqueue_active(&root->fs_info->tree_log_wait)) + wake_up(&root->fs_info->tree_log_wait); + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) { + mutex_lock(&log->fs_info->pinned_mutex); + btrfs_update_pinned_extents(log->fs_info->extent_root, + eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); + } + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + + } + if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. + */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, + start, extent_end, start, &alloc_hint); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(root, path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_free_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + if (key.offset == 0) + break; + key.offset--; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(root, path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (!dst_di || IS_ERR(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(root, path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(root, path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + if (ret && ret != -ENOENT) + BUG(); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(root, path); + return ret; +} + +/* + * this looks for a given directory item in the log. If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || IS_ERR(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log, log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. + */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(root, path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). + */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + u32 item_size; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + item_size = btrfs_item_size_nr(eb, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct inode *inode; + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, truncate away + * extents past the new EOF + */ + if (S_ISREG(mode)) { + inode = read_one_inode(root, + key.objectid); + BUG_ON(!inode); + + ret = btrfs_truncate_inode_items(wc->trans, + root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + BUG_ON(ret); + iput(inode); + } + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + + wc->process_func(root, next, wc, ptr_gen); + + if (*level == 1) { + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + bytenr = path->nodes[*level]->start; + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + + if (wc->free) { + next = path->nodes[*level]; + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + if (wc->free) + free_extent_buffer(log->node); + return ret; +} + +static int wait_log_commit(struct btrfs_root *log) +{ + DEFINE_WAIT(wait); + u64 transid = log->fs_info->tree_log_transid; + + do { + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) + schedule(); + finish_wait(&log->fs_info->tree_log_wait, &wait); + mutex_lock(&log->fs_info->tree_log_mutex); + } while (transid == log->fs_info->tree_log_transid && + atomic_read(&log->fs_info->tree_log_commit)); + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + unsigned long batch; + struct btrfs_root *log = root->log_root; + + mutex_lock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) { + wait_log_commit(log); + goto out; + } + atomic_set(&log->fs_info->tree_log_commit, 1); + + while (1) { + batch = log->fs_info->tree_log_batch; + mutex_unlock(&log->fs_info->tree_log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&log->fs_info->tree_log_mutex); + + while (atomic_read(&log->fs_info->tree_log_writers)) { + DEFINE_WAIT(wait); + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_writers)) + schedule(); + mutex_lock(&log->fs_info->tree_log_mutex); + finish_wait(&log->fs_info->tree_log_wait, &wait); + } + if (batch == log->fs_info->tree_log_batch) + break; + } + + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, + &root->fs_info->log_root_tree->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log->fs_info->log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log->fs_info->log_root_tree->node)); + + write_ctree_super(trans, log->fs_info->tree_root, 2); + log->fs_info->tree_log_transid++; + log->fs_info->tree_log_batch = 0; + atomic_set(&log->fs_info->tree_log_commit, 0); + smp_mb(); + if (waitqueue_active(&log->fs_info->tree_log_wait)) + wake_up(&log->fs_info->tree_log_wait); +out: + mutex_unlock(&log->fs_info->tree_log_mutex); + return 0; +} + +/* * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + int ret; + struct btrfs_root *log; + struct key; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + if (!root->log_root || root->fs_info->log_root_recovering) + return 0; + + log = root->log_root; + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log->dirty_log_pages, + start, end, GFP_NOFS); + } + + log = root->log_root; + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + root->log_root = NULL; + kfree(root->log_root); + return 0; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + u64 bytenr = btrfs_root_bytenr(&log->root_item); + int ret; + + if (log->node->start == bytenr) + return 0; + + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + BUG_ON(ret); + return ret; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + + log = root->log_root; + max_key.objectid = inode->i_ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != inode->i_ino || + min_key.type != key_type) { + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + } + } + btrfs_release_path(root, path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != inode->i_ino || + min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + BUG_ON(ret); + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + + BUG_ON(ret); + last_offset = tmp.offset; + goto done; + } + } +done: + *last_offset_ret = last_offset; + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + + /* insert the log range keys to indicate where the log is valid */ + ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, + first_offset, last_offset); + BUG_ON(ret); + return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + BUG_ON(ret); + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + + if (ret != 1) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + BUG_ON(ret); + btrfs_release_path(log, path); + } + btrfs_release_path(log, path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + BUG_ON(ret); + + for (i = 0; i < nr; i++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } + } + } + dst_path->slots[0]++; + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(log, dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, log, sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + u32 size; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + + log = root->log_root; + + path = btrfs_alloc_path(); + dst_path = btrfs_alloc_path(); + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = inode->i_ino; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, + inode->i_ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + BUG_ON(ret); + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != inode->i_ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + size = btrfs_item_size_nr(src, path->slots[0]); + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + btrfs_release_path(root, path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; + ret = log_directory_changes(trans, root, inode, path, dst_path); + BUG_ON(ret); + } + BTRFS_I(inode)->logged_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + + mutex_lock(&root->fs_info->tree_log_mutex); + ret = update_log_root(trans, log); + BUG_ON(ret); + mutex_unlock(&root->fs_info->tree_log_mutex); +out: + return 0; +} + +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + int ret; + + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + int inode_only = LOG_INODE_ALL; + struct super_block *sb; + int ret; + + start_log_trans(trans, root); + sb = dentry->d_inode->i_sb; + while (1) { + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + break; + + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; + } + end_log_trans(root); + return 0; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, &wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(!wc.replay_dest); + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); + if (ret == 0) { + wc.replay_dest->highest_inode = highest_inode; + wc.replay_dest->last_inode_alloc = highest_inode; + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 000000000000..b9409b32ed02 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 000000000000..9bf3946d5ef2 --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 000000000000..1ca1952fd917 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# Copyright 2008, Aron Griffis <agriffis@n01se.net> +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="v0.16" + +which git &> /dev/null +if [ $? == 0 ]; then + git branch >& /dev/null + if [ $? == 0 ]; then + if head=`git rev-parse --verify HEAD 2>/dev/null`; then + if tag=`git describe --tags 2>/dev/null`; then + v="$tag" + fi + + # Are there uncommitted changes? + git update-index --refresh --unmerged > /dev/null + if git diff-index --name-only HEAD | \ + grep -v "^scripts/package" \ + | read dummy; then + v="$v"-dirty + fi + fi + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 000000000000..b187b537888e --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/version.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + struct list_head *cur; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct list_head *cur; + struct btrfs_fs_devices *fs_devices; + + list_for_each(cur, &fs_uuids) { + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + unsigned long limit; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while (pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + bio_get(cur); + submit_bio(cur->bi_rw, cur); + bio_put(cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && + fs_info->fs_devices->open_devices > 1) { + struct bio *old_head; + + spin_lock(&device->io_lock); + + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + device->barriers = 1; + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) + goto error; + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + device->barriers = 1; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *tmp; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); +again: + list_for_each_safe(cur, tmp, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *cur; + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + list_for_each(cur, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + device->bdev = NULL; + device->writeable = 0; + device->in_fs_metadata = 0; + } + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = open_bdev_exclusive(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + continue; + +error_brelse: + brelse(bh); +error_close: + close_bdev_exclusive(bdev, FMODE_READ); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + bdev = open_bdev_exclusive(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else { + /* FIXME, make a readl uuid parser */ + printk(KERN_INFO "device fsid %llx-%llx ", + *(unsigned long long *)disk_super->fsid, + *(unsigned long long *)(disk_super->fsid + 8)); + } + printk(KERN_INFO "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + close_bdev_exclusive(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 num_bytes, u64 *start) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + start_found = 0; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto error; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? + last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + BUG_ON(*start < search_start); + + if (*start + num_bytes > search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + ret = 0; + +error: + btrfs_free_path(path); + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->rw_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *cur; + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + tmp = list_entry(cur, struct btrfs_device, dev_list); + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = open_bdev_exclusive(device_path, FMODE_READ, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + root->fs_info->fs_devices->rw_devices--; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + device->in_fs_metadata = 0; + list_del_init(&device->dev_list); + device->fs_devices->num_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + device->fs_devices->open_devices--; + } + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (device->fs_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == device->fs_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = device->fs_devices->seed; + device->fs_devices->seed = NULL; + __btrfs_close_devices(device->fs_devices); + free_fs_devices(device->fs_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (device->writeable) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + kfree(device->name); + kfree(device); + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + close_bdev_exclusive(bdev, FMODE_READ); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * does all the dirty work required for changing file system's UUID. + */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + list_splice_init(&fs_devices->devices, &seed_devices->devices); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *cur; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + if (!bdev) + return -EIO; + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 1); + lock_chunks(root); + + device->barriers = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = 0; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + close_bdev_exclusive(bdev, 0); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + spin_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(chunk_root, path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *cur; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + key.offset = found_key.offset; + /* chunk zero is special */ + if (key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); + + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + goto done; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) + goto done; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) + goto done; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret) + goto done; + } + +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, + int num_stripes, int sub_stripes) +{ + if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) + return calc_size; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + return calc_size * (num_stripes / sub_stripes); + else + return calc_size * num_stripes; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct list_head private_devs; + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = 1024 * 1024 * 1024; + u64 max_chunk_size = calc_size; + u64 min_free; + u64 avail; + u64 max_avail = 0; + u64 dev_offset; + int num_stripes = 1; + int min_stripes = 1; + int sub_stripes = 0; + int looped = 0; + int ret; + int index; + int stripe_len = 64 * 1024; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + num_stripes = fs_devices->rw_devices; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = 2; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, fs_devices->rw_devices); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 4) + return -ENOSPC; + num_stripes &= ~(u32)1; + sub_stripes = 2; + min_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_chunk_size = 4 * calc_size; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; + max_chunk_size = calc_size * 2; + min_stripe_size = 1 * 1024 * 1024; + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + +again: + if (!map || map->num_stripes != num_stripes) { + kfree(map); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = num_stripes; + } + + if (calc_size * num_stripes > max_chunk_size) { + calc_size = max_chunk_size; + do_div(calc_size, num_stripes); + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + } + /* we don't want tiny stripes */ + calc_size = max_t(u64, min_stripe_size, calc_size); + + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + + cur = fs_devices->alloc_list.next; + index = 0; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = calc_size * 2; + else + min_free = calc_size; + + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; + + INIT_LIST_HEAD(&private_devs); + while (index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + BUG_ON(!device->writeable); + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; + cur = cur->next; + + if (device->in_fs_metadata && avail >= min_free) { + ret = find_free_dev_extent(trans, device, + min_free, &dev_offset); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); + map->stripes[index].dev = device; + map->stripes[index].physical = dev_offset; + index++; + if (type & BTRFS_BLOCK_GROUP_DUP) { + map->stripes[index].dev = device; + map->stripes[index].physical = + dev_offset + calc_size; + index++; + } + } + } else if (device->in_fs_metadata && avail > max_avail) + max_avail = avail; + if (cur == &fs_devices->alloc_list) + break; + } + list_splice(&private_devs, &fs_devices->alloc_list); + if (index < num_stripes) { + if (index >= min_stripes) { + num_stripes = index; + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes /= sub_stripes; + num_stripes *= sub_stripes; + } + looped = 1; + goto again; + } + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + kfree(map); + return -ENOSPC; + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = stripe_len; + map->io_align = stripe_len; + map->io_width = stripe_len; + map->type = type; + map->num_stripes = num_stripes; + map->sub_stripes = sub_stripes; + + *map_ret = map; + *stripe_size = calc_size; + *num_bytes = chunk_bytes_by_type(type, calc_size, + num_stripes, sub_stripes); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + kfree(map); + return -ENOMEM; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = *num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, *num_bytes); + BUG_ON(ret); + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, calc_size); + BUG_ON(ret); + index++; + } + + return 0; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. + */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + spin_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + spin_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + spin_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + spin_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (1 << BIO_RW))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + spin_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & (1 << BIO_RW)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (multi_ret && rw == WRITE && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret && !unplug_page) + goto out; + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (1 << BIO_RW)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } + stripe_index++; + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + for (i = 0; i > nr; i++) { + struct btrfs_multi_bio *multi; + struct btrfs_bio_stripe *stripe; + int ret; + + length = 1; + ret = btrfs_map_block(map_tree, WRITE, buf[i], + &length, &multi, 0); + BUG_ON(ret); + + stripe = multi->stripes; + for (j = 0; j < multi->num_stripes; j++) { + if (stripe->physical >= physical && + physical < stripe->physical + length) + break; + } + BUG_ON(j >= multi->num_stripes); + kfree(multi); + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return __btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + BUG_ON(rw == WRITE && !dev->writeable); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + spin_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return -ENOMEM; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + spin_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + spin_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 000000000000..86c44e9ae110 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include <linux/bio.h> +#include "async-thread.h" + +struct buffer_head; +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; + u64 generation; + + int barriers; + int writeable; + int in_fs_metadata; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to open_bdev_exclusive */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent coyp of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 total_rw_bytes; + struct block_device *latest_bdev; + /* all of the devices in the FS */ + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 000000000000..7f332e270894 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + strlen(name), 0); + if (!di || IS_ERR(di)) { + ret = -ENODATA; + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret = 0, mod = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) { + mod = 1; + goto out; + } + } else { + btrfs_release_path(root, path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), + value, size, inode->i_ino); + if (ret) + goto out; + mod = 1; + +out: + if (mod) { + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + } + + btrfs_end_transaction(trans, root); + btrfs_free_path(path); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_item *item; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot, advance; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + u32 nritems; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + ret = 0; + advance = 0; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (advance || slot >= nritems) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + if (slot >= nritems-1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + /* + * just walking through the slots on this leaf + */ + slot++; + path->slots[0]++; + } + } + advance = 1; + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + continue; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 000000000000..5b1d08f8e68d --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include <linux/xattr.h> + +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 000000000000..ecfbce836d32 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include "compression.h" + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + goto next; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + goto next; + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} diff --git a/fs/buffer.c b/fs/buffer.c index 10179cfa1152..b6e8b8632e2f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page) page_cache_release(page); } + +static int quiet_error(struct buffer_head *bh) +{ + if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) + return 0; + return 1; +} + + static void buffer_io_error(struct buffer_head *bh) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); @@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", @@ -195,10 +203,25 @@ int fsync_bdev(struct block_device *bdev) * happen on bdev until thaw_bdev() is called. * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; down(&bdev->bd_mount_sem); sb = get_super(bdev); @@ -213,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev) sync_blockdev(sb->s_bdev); - if (sb->s_op->write_super_lockfs) - sb->s_op->write_super_lockfs(sb); + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } } sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ } EXPORT_SYMBOL(freeze_bdev); @@ -229,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev); * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -void thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev, struct super_block *sb) { + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + if (sb) { BUG_ON(sb->s_bdev != bdev); - - if (sb->s_op->unlockfs) - sb->s_op->unlockfs(sb); - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } drop_super(sb); } up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } EXPORT_SYMBOL(thaw_bdev); @@ -394,7 +458,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); - if (printk_ratelimit()) + if (!quiet_error(bh)) buffer_io_error(bh); SetPageError(page); } @@ -455,7 +519,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (printk_ratelimit()) { + if (!quiet_error(bh)) { buffer_io_error(bh); printk(KERN_WARNING "lost page write due to " "I/O error on %s\n", @@ -1988,7 +2052,7 @@ int block_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { ownpage = 1; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { status = -ENOMEM; goto out; @@ -2014,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping, if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } - goto out; } out: @@ -2494,7 +2557,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; @@ -2913,6 +2976,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) set_bit(BH_Eopnotsupp, &bh->b_state); } + if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) + set_bit(BH_Quiet, &bh->b_state); + bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); bio_put(bio); } diff --git a/fs/char_dev.c b/fs/char_dev.c index 700697a72618..38f71222a552 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strncpy(cd->name,name, 64); + strlcpy(cd->name, name, sizeof(cd->name)); i = major_to_index(major); diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 6ba43fb346fb..9948c0030e86 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ - md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \ + md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0005a194a75c..13ea53251dcf 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = { #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, #ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, .setlease = cifs_setlease, #endif /* CONFIG_CIFS_EXPERIMENTAL */ }; @@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = { .readdir = cifs_readdir, .release = cifs_closedir, .read = generic_read_dir, -#ifdef CONFIG_CIFS_EXPERIMENTAL - .dir_notify = cifs_dir_notify, -#endif /* CONFIG_CIFS_EXPERIMENTAL */ .unlocked_ioctl = cifs_ioctl, .llseek = generic_file_llseek, }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 2ce04c73d74e..7ac481841f87 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); -extern int cifs_dir_notify(struct file *, unsigned long arg); /* Functions related to dir entries */ extern struct dentry_operations cifs_dentry_ops; diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c deleted file mode 100644 index 5a57581eb4b2..000000000000 --- a/fs/cifs/fcntl.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * fs/cifs/fcntl.c - * - * vfs operations that deal with the file control API - * - * Copyright (C) International Business Machines Corp., 2003,2004 - * Author(s): Steve French (sfrench@us.ibm.com) - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/fcntl.h> -#include "cifsglob.h" -#include "cifsproto.h" -#include "cifs_unicode.h" -#include "cifs_debug.h" -#include "cifsfs.h" - -static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags) -{ - __u32 cifs_ntfy_flags = 0; - - /* No way on Linux VFS to ask to monitor xattr - changes (and no stream support either */ - if (fcntl_notify_flags & DN_ACCESS) - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS; - if (fcntl_notify_flags & DN_MODIFY) { - /* What does this mean on directories? */ - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE | - FILE_NOTIFY_CHANGE_SIZE; - } - if (fcntl_notify_flags & DN_CREATE) { - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION | - FILE_NOTIFY_CHANGE_LAST_WRITE; - } - if (fcntl_notify_flags & DN_DELETE) - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE; - if (fcntl_notify_flags & DN_RENAME) { - /* BB review this - checking various server behaviors */ - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME | - FILE_NOTIFY_CHANGE_FILE_NAME; - } - if (fcntl_notify_flags & DN_ATTRIB) { - cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY | - FILE_NOTIFY_CHANGE_ATTRIBUTES; - } -/* if (fcntl_notify_flags & DN_MULTISHOT) { - cifs_ntfy_flags |= ; - } */ /* BB fixme - not sure how to handle this with CIFS yet */ - - return cifs_ntfy_flags; -} - -int cifs_dir_notify(struct file *file, unsigned long arg) -{ - int xid; - int rc = -EINVAL; - int oplock = 0; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - char *full_path = NULL; - __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES; - __u16 netfid; - - if (experimEnabled == 0) - return 0; - - xid = GetXid(); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = cifs_sb->tcon; - - full_path = build_path_from_dentry(file->f_path.dentry); - - if (full_path == NULL) { - rc = -ENOMEM; - } else { - cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg)); - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, - GENERIC_READ | SYNCHRONIZE, 0 /* create options */, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - /* BB fixme - add this handle to a notify handle list */ - if (rc) { - cFYI(1, ("Could not open directory for notify")); - } else { - filter = convert_to_cifs_notify_flags(arg); - if (filter != 0) { - rc = CIFSSMBNotify(xid, pTcon, - 0 /* no subdirs */, netfid, - filter, file, arg & DN_MULTISHOT, - cifs_sb->local_nls); - } else { - rc = -EINVAL; - } - /* BB add code to close file eventually (at unmount - it would close automatically but may be a way - to do it easily when inode freed or when - notify info is cleared/changed */ - cFYI(1, ("notify rc %d", rc)); - } - } - - FreeXid(xid); - return rc; -} diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b1e1fc6a6e6a..12bb656fbe75 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { rc = -ENOMEM; goto out; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f247da9f4edc..5ab9896fdcb2 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1641,7 +1641,7 @@ do_expand: i_size_write(inode, offset); spin_unlock(&inode->i_lock); out_truncate: - if (inode->i_op && inode->i_op->truncate) + if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; out_sig: diff --git a/fs/coda/file.c b/fs/coda/file.c index 466303db2df6..6a347fbc998a 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) { struct file *host_file; - struct dentry *host_dentry; - struct inode *host_inode, *coda_inode = coda_dentry->d_inode; + struct inode *coda_inode = coda_dentry->d_inode; struct coda_file_info *cfi; int err = 0; @@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (host_file->f_op && host_file->f_op->fsync) { - host_dentry = host_file->f_path.dentry; - host_inode = host_dentry->d_inode; - mutex_lock(&host_inode->i_mutex); - err = host_file->f_op->fsync(host_file, host_dentry, datasync); - mutex_unlock(&host_inode->i_mutex); - } - + err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); if ( !err && !datasync ) { lock_kernel(); err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 81b7771c6465..43c96ce29614 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -11,7 +11,9 @@ #include "coda_int.h" +#ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; +#endif static ctl_table coda_table[] = { { @@ -41,6 +43,7 @@ static ctl_table coda_table[] = { {} }; +#ifdef CONFIG_SYSCTL static ctl_table fs_table[] = { { .ctl_name = CTL_UNNUMBERED, @@ -50,7 +53,7 @@ static ctl_table fs_table[] = { }, {} }; - +#endif void coda_sysctl_init(void) { diff --git a/fs/compat.c b/fs/compat.c index d1ece79b6411..30f2faa22f5c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); fput(file); return ret; } @@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); fput(file); return ret; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4803ccc94480..5d349d38e056 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) static inline void set_default_inode_attr(struct inode * inode, mode_t mode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } @@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_blocks = 0; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f40423eb1a14..a07338d2d140 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode->i_op = &page_symlink_inode_operations; inode->i_data.a_ops = &cramfs_aops; } else { - inode->i_size = 0; - inode->i_blocks = 0; init_special_inode(inode, inode->i_mode, old_decode_dev(cramfs_inode->size)); } diff --git a/fs/dcache.c b/fs/dcache.c index a1d86c7f3e66..4547f66884a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -34,7 +34,6 @@ #include <linux/bootmem.h> #include "internal.h" - int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); @@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; -#ifdef CONFIG_PROFILING - dentry->d_cookie = NULL; -#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -1336,7 +1332,7 @@ err_out: * * Searches the children of the parent dentry for the name in question. If * the dentry is found its reference count is incremented and the dentry - * is returned. The caller must use d_put to free the entry when it has + * is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned on failure. * * __d_lookup is dcache_lock free. The hash list is protected using RCU. @@ -1571,10 +1567,6 @@ void d_rehash(struct dentry * entry) spin_unlock(&dcache_lock); } -#define do_switch(x,y) do { \ - __typeof__ (x) __tmp = x; \ - x = y; y = __tmp; } while (0) - /* * When switching names, the actual string doesn't strictly have to * be preserved in the target - because we're dropping the target @@ -1593,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - do_switch(target->d_name.name, dentry->d_name.name); + swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's @@ -1620,8 +1612,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target) */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); + dentry->d_name.len = target->d_name.len; + return; } } + swap(dentry->d_name.len, target->d_name.len); } /* @@ -1681,8 +1676,7 @@ already_unhashed: /* Switch the names.. */ switch_names(dentry, target); - do_switch(dentry->d_name.len, target->d_name.len); - do_switch(dentry->d_name.hash, target->d_name.hash); + swap(dentry->d_name.hash, target->d_name.hash); /* ... and switch the parents */ if (IS_ROOT(dentry)) { @@ -1690,7 +1684,7 @@ already_unhashed: target->d_parent = target; INIT_LIST_HEAD(&target->d_u.d_child); } else { - do_switch(dentry->d_parent, target->d_parent); + swap(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); @@ -1791,8 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) struct dentry *dparent, *aparent; switch_names(dentry, anon); - do_switch(dentry->d_name.len, anon->d_name.len); - do_switch(dentry->d_name.hash, anon->d_name.hash); + swap(dentry->d_name.hash, anon->d_name.hash); dparent = dentry->d_parent; aparent = anon->d_parent; @@ -1911,7 +1904,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * - * Returns the buffer or an error code if the path was too long. + * Returns a pointer into the buffer or an error code if the + * path was too long. * * "buflen" should be positive. Caller holds the dcache_lock. * @@ -1987,7 +1981,10 @@ Elong: * Convert a dentry into an ASCII path name. If the entry has been deleted * the string " (deleted)" is appended. Note that this is ambiguous. * - * Returns the buffer or an error code if the path was too long. + * Returns a pointer into the buffer or an error code if the path was + * too long. Note: Callers should use the returned pointer, not the passed + * in buffer, to use the name! The implementation often starts at an offset + * into the buffer, and may leave 0 bytes at the start. * * "buflen" should be positive. */ @@ -2313,9 +2310,6 @@ static void __init dcache_init(void) /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; -/* SLAB cache for file structures */ -struct kmem_cache *filp_cachep __read_mostly; - EXPORT_SYMBOL(d_genocide); void __init vfs_caches_init_early(void) @@ -2337,9 +2331,6 @@ void __init vfs_caches_init(unsigned long mempages) names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - dcache_init(); inode_init(); files_init(mempages); diff --git a/fs/dcookies.c b/fs/dcookies.c index 855d4b1d619a..180e9fec4ad8 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); + struct dentry *d; if (!dcs) return NULL; - path->dentry->d_cookie = dcs; + d = path->dentry; + spin_lock(&d->d_lock); + d->d_flags |= DCACHE_COOKIE; + spin_unlock(&d->d_lock); + dcs->path = *path; path_get(path); hash_dcookie(dcs); @@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie) goto out; } - dcs = path->dentry->d_cookie; - - if (!dcs) + if (path->dentry->d_flags & DCACHE_COOKIE) { + dcs = find_dcookie((unsigned long)path->dentry); + } else { dcs = alloc_dcookie(path); - - if (!dcs) { - err = -ENOMEM; - goto out; + if (!dcs) { + err = -ENOMEM; + goto out; + } } *cookie = dcookie_value(dcs); @@ -251,7 +256,12 @@ out_kmem: static void free_dcookie(struct dcookie_struct * dcs) { - dcs->path.dentry->d_cookie = NULL; + struct dentry *d = dcs->path.dentry; + + spin_lock(&d->d_lock); + d->d_flags &= ~DCACHE_COOKIE; + spin_unlock(&d->d_lock); + path_put(&dcs->path); kmem_cache_free(dcookie_cache, dcs); } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 159a5efd6a8a..33a90120f6ad 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_x32); + +static int debugfs_size_t_set(void *data, u64 val) +{ + *(size_t *)data = val; + return 0; +} +static int debugfs_size_t_get(void *data, u64 *val) +{ + *val = *(size_t *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ + +/** + * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, size_t *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_size_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_size_t); + + static ssize_t read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3dbe2169cf36..81ae9ea3c6e1 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d if (inode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 5d61b7c06e13..5f3231b9633f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -27,25 +27,32 @@ #define DEVPTS_SUPER_MAGIC 0x1cd1 #define DEVPTS_DEFAULT_MODE 0600 +/* + * ptmx is a new node in /dev/pts and will be unused in legacy (single- + * instance) mode. To prevent surprises in user space, set permissions of + * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful + * permissions. + */ +#define DEVPTS_DEFAULT_PTMX_MODE 0000 #define PTMX_MINOR 2 extern int pty_limit; /* Config limit on Unix98 ptys */ -static DEFINE_IDA(allocated_ptys); static DEFINE_MUTEX(allocated_ptys_lock); static struct vfsmount *devpts_mnt; -static struct dentry *devpts_root; -static struct { +struct pts_mount_opts { int setuid; int setgid; uid_t uid; gid_t gid; umode_t mode; -} config = {.mode = DEVPTS_DEFAULT_MODE}; + umode_t ptmxmode; + int newinstance; +}; enum { - Opt_uid, Opt_gid, Opt_mode, + Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, Opt_err }; @@ -53,18 +60,50 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + {Opt_ptmxmode, "ptmxmode=%o"}, + {Opt_newinstance, "newinstance"}, +#endif {Opt_err, NULL} }; -static int devpts_remount(struct super_block *sb, int *flags, char *data) +struct pts_fs_info { + struct ida allocated_ptys; + struct pts_mount_opts mount_opts; + struct dentry *ptmx_dentry; +}; + +static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct super_block *pts_sb_from_inode(struct inode *inode) +{ +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return inode->i_sb; +#endif + return devpts_mnt->mnt_sb; +} + +#define PARSE_MOUNT 0 +#define PARSE_REMOUNT 1 + +static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) { char *p; - config.setuid = 0; - config.setgid = 0; - config.uid = 0; - config.gid = 0; - config.mode = DEVPTS_DEFAULT_MODE; + opts->setuid = 0; + opts->setgid = 0; + opts->uid = 0; + opts->gid = 0; + opts->mode = DEVPTS_DEFAULT_MODE; + opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 0; while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; - config.uid = option; - config.setuid = 1; + opts->uid = option; + opts->setuid = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; - config.gid = option; - config.setgid = 1; + opts->gid = option; + opts->setgid = 1; break; case Opt_mode: if (match_octal(&args[0], &option)) return -EINVAL; - config.mode = option & S_IALLUGO; + opts->mode = option & S_IALLUGO; + break; +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + case Opt_ptmxmode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->ptmxmode = option & S_IALLUGO; + break; + case Opt_newinstance: + /* newinstance makes sense only on initial mount */ + if (op == PARSE_MOUNT) + opts->newinstance = 1; break; +#endif default: printk(KERN_ERR "devpts: called with bogus options\n"); return -EINVAL; @@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) return 0; } +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int mknod_ptmx(struct super_block *sb) +{ + int mode; + int rc = -ENOMEM; + struct dentry *dentry; + struct inode *inode; + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + mutex_lock(&root->d_inode->i_mutex); + + /* If we have already created ptmx node, return */ + if (fsi->ptmx_dentry) { + rc = 0; + goto out; + } + + dentry = d_alloc_name(root, "ptmx"); + if (!dentry) { + printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n"); + goto out; + } + + /* + * Create a new 'ptmx' node in this mount of devpts. + */ + inode = new_inode(sb); + if (!inode) { + printk(KERN_ERR "Unable to alloc inode for ptmx node\n"); + dput(dentry); + goto out; + } + + inode->i_ino = 2; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + mode = S_IFCHR|opts->ptmxmode; + init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); + + d_add(dentry, inode); + + fsi->ptmx_dentry = dentry; + rc = 0; + + printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n", + inode->i_ino); +out: + mutex_unlock(&root->d_inode->i_mutex); + return rc; +} + +static void update_ptmx_mode(struct pts_fs_info *fsi) +{ + struct inode *inode; + if (fsi->ptmx_dentry) { + inode = fsi->ptmx_dentry->d_inode; + inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; + } +} +#else +static inline void update_ptmx_mode(struct pts_fs_info *fsi) +{ + return; +} +#endif + +static int devpts_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + err = parse_mount_options(data, PARSE_REMOUNT, opts); + + /* + * parse_mount_options() restores options to default values + * before parsing and may have changed ptmxmode. So, update the + * mode in the inode too. Bogus options don't fail the remount, + * so do this even on error return. + */ + update_ptmx_mode(fsi); + + return err; +} + static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) { - if (config.setuid) - seq_printf(seq, ",uid=%u", config.uid); - if (config.setgid) - seq_printf(seq, ",gid=%u", config.gid); - seq_printf(seq, ",mode=%03o", config.mode); + struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb); + struct pts_mount_opts *opts = &fsi->mount_opts; + + if (opts->setuid) + seq_printf(seq, ",uid=%u", opts->uid); + if (opts->setgid) + seq_printf(seq, ",gid=%u", opts->gid); + seq_printf(seq, ",mode=%03o", opts->mode); +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES + seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); +#endif return 0; } @@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; +static void *new_pts_fs_info(void) +{ + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return NULL; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + + return fsi; +} + static int devpts_fill_super(struct super_block *s, void *data, int silent) { - struct inode * inode; + struct inode *inode; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + s->s_fs_info = new_pts_fs_info(); + if (!s->s_fs_info) + goto fail; + inode = new_inode(s); if (!inode) - goto fail; + goto free_fsi; inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_blocks = 0; - inode->i_uid = inode->i_gid = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_nlink = 2; - devpts_root = s->s_root = d_alloc_root(inode); + s->s_root = d_alloc_root(inode); if (s->s_root) return 0; - - printk("devpts: get root dentry failed\n"); + + printk(KERN_ERR "devpts: get root dentry failed\n"); iput(inode); + +free_fsi: + kfree(s->s_fs_info); fail: return -ENOMEM; } +#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES +static int compare_init_pts_sb(struct super_block *s, void *p) +{ + if (devpts_mnt) + return devpts_mnt->mnt_sb == s; + return 0; +} + +/* + * Safely parse the mount options in @data and update @opts. + * + * devpts ends up parsing options two times during mount, due to the + * two modes of operation it supports. The first parse occurs in + * devpts_get_sb() when determining the mode (single-instance or + * multi-instance mode). The second parse happens in devpts_remount() + * or new_pts_mount() depending on the mode. + * + * Parsing of options modifies the @data making subsequent parsing + * incorrect. So make a local copy of @data and parse it. + * + * Return: 0 On success, -errno on error + */ +static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts) +{ + int rc; + void *datacp; + + if (!data) + return 0; + + /* Use kstrdup() ? */ + datacp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!datacp) + return -ENOMEM; + + memcpy(datacp, data, PAGE_SIZE); + rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts); + kfree(datacp); + + return rc; +} + +/* + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. + */ +static int new_pts_mount(struct file_system_type *fs_type, int flags, + void *data, struct vfsmount *mnt) +{ + int err; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; + + printk(KERN_NOTICE "devpts: newinstance mount\n"); + + err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); + if (err) + return err; + + fsi = DEVPTS_SB(mnt->mnt_sb); + opts = &fsi->mount_opts; + + err = parse_mount_options(data, PARSE_MOUNT, opts); + if (err) + goto fail; + + err = mknod_ptmx(mnt->mnt_sb); + if (err) + goto fail; + + return 0; + +fail: + dput(mnt->mnt_sb->s_root); + deactivate_super(mnt->mnt_sb); + return err; +} + +/* + * Check if 'newinstance' mount option was specified in @data. + * + * Return: -errno on error (eg: invalid mount options specified) + * : 1 if 'newinstance' mount option was specified + * : 0 if 'newinstance' mount option was NOT specified + */ +static int is_new_instance_mount(void *data) +{ + int rc; + struct pts_mount_opts opts; + + if (!data) + return 0; + + rc = safe_parse_mount_options(data, &opts); + if (!rc) + rc = opts.newinstance; + + return rc; +} + +/* + * get_init_pts_sb() + * + * This interface is needed to support multiple namespace semantics in + * devpts while preserving backward compatibility of the current 'single- + * namespace' semantics. i.e all mounts of devpts without the 'newinstance' + * mount option should bind to the initial kernel mount, like + * get_sb_single(). + * + * Mounts with 'newinstance' option create a new private namespace. + * + * But for single-mount semantics, devpts cannot use get_sb_single(), + * because get_sb_single()/sget() find and use the super-block from + * the most recent mount of devpts. But that recent mount may be a + * 'newinstance' mount and get_sb_single() would pick the newinstance + * super-block instead of the initial super-block. + * + * This interface is identical to get_sb_single() except that it + * consistently selects the 'single-namespace' superblock even in the + * presence of the private namespace (i.e 'newinstance') super-blocks. + */ +static int get_init_pts_sb(struct file_system_type *fs_type, int flags, + void *data, struct vfsmount *mnt) +{ + struct super_block *s; + int error; + + s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); + if (IS_ERR(s)) + return PTR_ERR(s); + + if (!s->s_root) { + s->s_flags = flags; + error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + return error; + } + s->s_flags |= MS_ACTIVE; + } + do_remount_sb(s, flags, data, 0); + return simple_set_mnt(mnt, s); +} + +/* + * Mount or remount the initial kernel mount of devpts. This type of + * mount maintains the legacy, single-instance semantics, while the + * kernel still allows multiple-instances. + */ +static int init_pts_mount(struct file_system_type *fs_type, int flags, + void *data, struct vfsmount *mnt) +{ + int err; + + err = get_init_pts_sb(fs_type, flags, data, mnt); + if (err) + return err; + + err = mknod_ptmx(mnt->mnt_sb); + if (err) { + dput(mnt->mnt_sb->s_root); + deactivate_super(mnt->mnt_sb); + } + + return err; +} + static int devpts_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { + int new; + + new = is_new_instance_mount(data); + if (new < 0) + return new; + + if (new) + return new_pts_mount(fs_type, flags, data, mnt); + + return init_pts_mount(fs_type, flags, data, mnt); +} +#else +/* + * This supports only the legacy single-instance semantics (no + * multiple-instance semantics) + */ +static int devpts_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } +#endif + +static void devpts_kill_sb(struct super_block *sb) +{ + struct pts_fs_info *fsi = DEVPTS_SB(sb); + + kfree(fsi); + kill_litter_super(sb); +} static struct file_system_type devpts_fs_type = { .owner = THIS_MODULE, .name = "devpts", .get_sb = devpts_get_sb, - .kill_sb = kill_anon_super, + .kill_sb = devpts_kill_sb, }; /* @@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); int index; int ida_ret; retry: - if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { + if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; - } mutex_lock(&allocated_ptys_lock); - ida_ret = ida_get_new(&allocated_ptys, &index); + ida_ret = ida_get_new(&fsi->allocated_ptys, &index); if (ida_ret < 0) { mutex_unlock(&allocated_ptys_lock); if (ida_ret == -EAGAIN) @@ -190,7 +551,7 @@ retry: } if (index >= pty_limit) { - ida_remove(&allocated_ptys, index); + ida_remove(&fsi->allocated_ptys, index); mutex_unlock(&allocated_ptys_lock); return -EIO; } @@ -200,18 +561,26 @@ retry: void devpts_kill_index(struct inode *ptmx_inode, int idx) { + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct pts_fs_info *fsi = DEVPTS_SB(sb); + mutex_lock(&allocated_ptys_lock); - ida_remove(&allocated_ptys, idx); + ida_remove(&fsi->allocated_ptys, idx); mutex_unlock(&allocated_ptys_lock); } int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) { - int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ + /* tty layer puts index from devpts_new_index() in here */ + int number = tty->index; struct tty_driver *driver = tty->driver; dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; - struct inode *inode = new_inode(devpts_mnt->mnt_sb); + struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct inode *inode = new_inode(sb); + struct dentry *root = sb->s_root; + struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_mount_opts *opts = &fsi->mount_opts; char s[12]; /* We're supposed to be given the slave end of a pty */ @@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) if (!inode) return -ENOMEM; - inode->i_ino = number+2; - inode->i_uid = config.setuid ? config.uid : current_fsuid(); - inode->i_gid = config.setgid ? config.gid : current_fsgid(); + inode->i_ino = number + 3; + inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); + inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - init_special_inode(inode, S_IFCHR|config.mode, device); + init_special_inode(inode, S_IFCHR|opts->mode, device); inode->i_private = tty; tty->driver_data = inode; sprintf(s, "%d", number); - mutex_lock(&devpts_root->d_inode->i_mutex); + mutex_lock(&root->d_inode->i_mutex); - dentry = d_alloc_name(devpts_root, s); + dentry = d_alloc_name(root, s); if (!IS_ERR(dentry)) { d_add(dentry, inode); - fsnotify_create(devpts_root->d_inode, dentry); + fsnotify_create(root->d_inode, dentry); } - mutex_unlock(&devpts_root->d_inode->i_mutex); + mutex_unlock(&root->d_inode->i_mutex); return 0; } @@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) void devpts_pty_kill(struct tty_struct *tty) { struct inode *inode = tty->driver_data; + struct super_block *sb = pts_sb_from_inode(inode); + struct dentry *root = sb->s_root; struct dentry *dentry; BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&devpts_root->d_inode->i_mutex); + mutex_lock(&root->d_inode->i_mutex); dentry = d_find_alias(inode); - if (dentry && !IS_ERR(dentry)) { + if (IS_ERR(dentry)) + goto out; + + if (dentry) { inode->i_nlink--; d_delete(dentry); - dput(dentry); + dput(dentry); /* d_alloc_name() in devpts_pty_new() */ } - mutex_unlock(&devpts_root->d_inode->i_mutex); + dput(dentry); /* d_find_alias above */ +out: + mutex_unlock(&root->d_inode->i_mutex); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index af0558dbe8b7..b6d43908ff7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by + * it's own meaner. + */ + if (unlikely(retval < 0 && (rw & WRITE))) { + loff_t isize = i_size_read(inode); + + if (end > isize && dio_lock_type == DIO_LOCKING) + vmtruncate(inode, isize); + } + if (rw == READ && dio_lock_type == DIO_LOCKING) release_i_mutex = 0; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 8bf31e3fbf01..dc2ad6008b2d 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb) spin_unlock(&ast_queue_lock); } -void dlm_add_ast(struct dlm_lkb *lkb, int type) +void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) { if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, type); + dlm_user_add_ast(lkb, type, bastmode); return; } @@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type) list_add_tail(&lkb->lkb_astqueue, &ast_queue); } lkb->lkb_ast_type |= type; + if (bastmode) + lkb->lkb_bastmode = bastmode; spin_unlock(&ast_queue_lock); set_bit(WAKE_ASTS, &astd_wakeflags); @@ -59,50 +61,40 @@ static void process_asts(void) struct dlm_lkb *lkb; void (*cast) (void *astparam); void (*bast) (void *astparam, int mode); - int type = 0, found, bmode; - - for (;;) { - found = 0; - spin_lock(&ast_queue_lock); - list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { - r = lkb->lkb_resource; - ls = r->res_ls; - - if (dlm_locking_stopped(ls)) - continue; - - list_del(&lkb->lkb_astqueue); - type = lkb->lkb_ast_type; - lkb->lkb_ast_type = 0; - found = 1; - break; - } - spin_unlock(&ast_queue_lock); + int type = 0, bastmode; + +repeat: + spin_lock(&ast_queue_lock); + list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { + r = lkb->lkb_resource; + ls = r->res_ls; + + if (dlm_locking_stopped(ls)) + continue; - if (!found) - break; + list_del(&lkb->lkb_astqueue); + type = lkb->lkb_ast_type; + lkb->lkb_ast_type = 0; + bastmode = lkb->lkb_bastmode; + spin_unlock(&ast_queue_lock); cast = lkb->lkb_astfn; bast = lkb->lkb_bastfn; - bmode = lkb->lkb_bastmode; if ((type & AST_COMP) && cast) cast(lkb->lkb_astparam); - /* FIXME: Is it safe to look at lkb_grmode here - without doing a lock_rsb() ? - Look at other checks in v1 to avoid basts. */ - if ((type & AST_BAST) && bast) - if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) - bast(lkb->lkb_astparam, bmode); + bast(lkb->lkb_astparam, bastmode); /* this removes the reference added by dlm_add_ast and may result in the lkb being freed */ dlm_put_lkb(lkb); - schedule(); + cond_resched(); + goto repeat; } + spin_unlock(&ast_queue_lock); } static inline int no_asts(void) diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 6ee276c74c52..1b5fc5f428fd 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -13,7 +13,7 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_add_ast(struct dlm_lkb *lkb, int type); +void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); void dlm_del_ast(struct dlm_lkb *lkb); void dlm_astd_wake(void); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fc24f4507a3..2f107d1a6a45 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -27,7 +27,7 @@ static struct dentry *dlm_root; struct rsb_iter { int entry; - int locks; + int format; int header; struct dlm_ls *ls; struct list_head *next; @@ -60,8 +60,8 @@ static char *print_lockmode(int mode) } } -static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *res) +static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) { seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); @@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, seq_printf(s, "\n"); } -static int print_resource(struct dlm_rsb *res, struct seq_file *s) +static int print_format1(struct dlm_rsb *res, struct seq_file *s) { struct dlm_lkb *lkb; int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; @@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) /* Print the locks attached to this resource */ seq_printf(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); seq_printf(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); seq_printf(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); if (list_empty(&res->res_lookup)) goto out; @@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) return 0; } -static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) +static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) { - unsigned int waiting = 0; - uint64_t xid = 0; + u64 xid = 0; + u64 us; if (lkb->lkb_flags & DLM_IFL_USER) { if (lkb->lkb_ua) xid = lkb->lkb_ua->xid; } - if (lkb->lkb_timestamp) - waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); - /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us r_nodeid r_len r_name */ - seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_remid, @@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb * lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - waiting, + (unsigned long long)us, r->res_nodeid, r->res_length, r->res_name); } -static int print_locks(struct dlm_rsb *r, struct seq_file *s) +static int print_format2(struct dlm_rsb *r, struct seq_file *s) { struct dlm_lkb *lkb; lock_rsb(r); list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); + + unlock_rsb(r); + return 0; +} + +static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_highbast, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); +} + +static int print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_printf(s, "%s", print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_printf(s, "\n"); + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + seq_printf(s, "\n"); + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + print_format3_lock(s, lkb, 1); unlock_rsb(r); return 0; @@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri) break; } read_unlock(&ls->ls_rsbtbl[i].lock); - } + } ri->entry = i; if (ri->entry >= ls->ls_rsbtbl_size) @@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri) read_unlock(&ls->ls_rsbtbl[i].lock); dlm_put_rsb(old); goto top; - } + } ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); dlm_hold_rsb(ri->rsb); read_unlock(&ls->ls_rsbtbl[i].lock); @@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) ri->ls = ls; ri->entry = 0; ri->next = NULL; + ri->format = 1; if (rsb_iter_next(ri)) { rsb_iter_free(ri); @@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr) { struct rsb_iter *ri = iter_ptr; - if (ri->locks) { + switch (ri->format) { + case 1: + print_format1(ri->rsb, file); + break; + case 2: if (ri->header) { - seq_printf(file, "id nodeid remid pid xid exflags flags " - "sts grmode rqmode time_ms r_nodeid " - "r_len r_name\n"); + seq_printf(file, "id nodeid remid pid xid exflags " + "flags sts grmode rqmode time_ms " + "r_nodeid r_len r_name\n"); ri->header = 0; } - print_locks(ri->rsb, file); - } else { - print_resource(ri->rsb, file); + print_format2(ri->rsb, file); + break; + case 3: + if (ri->header) { + seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + print_format3(ri->rsb, file); + break; } return 0; @@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) ri->ls = ls; ri->entry = 0; ri->next = NULL; - ri->locks = 1; + ri->format = 2; if (*pos == 0) ri->header = 1; @@ -448,6 +548,84 @@ static const struct file_operations locks_fops = { }; /* + * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks + * This can replace both formats 1 and 2 eventually. + */ + +static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos) +{ + struct rsb_iter *ri; + + ri = kzalloc(sizeof *ri, GFP_KERNEL); + if (!ri) + return NULL; + + ri->ls = ls; + ri->entry = 0; + ri->next = NULL; + ri->format = 3; + + if (*pos == 0) + ri->header = 1; + + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + + return ri; +} + +static void *all_seq_start(struct seq_file *file, loff_t *pos) +{ + struct rsb_iter *ri; + loff_t n = *pos; + + ri = all_iter_init(file->private, pos); + if (!ri) + return NULL; + + while (n--) { + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + } + + return ri; +} + +static struct seq_operations all_seq_ops = { + .start = all_seq_start, + .next = rsb_seq_next, + .stop = rsb_seq_stop, + .show = rsb_seq_show, +}; + +static int all_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &all_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations all_fops = { + .owner = THIS_MODULE, + .open = all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* * dump lkb's on the ls_waiters list */ @@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = { .read = waiters_read }; +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + if (ls->ls_debug_rsb_dentry) + debugfs_remove(ls->ls_debug_rsb_dentry); + if (ls->ls_debug_waiters_dentry) + debugfs_remove(ls->ls_debug_waiters_dentry); + if (ls->ls_debug_locks_dentry) + debugfs_remove(ls->ls_debug_locks_dentry); + if (ls->ls_debug_all_dentry) + debugfs_remove(ls->ls_debug_all_dentry); +} + int dlm_create_debug_file(struct dlm_ls *ls) { char name[DLM_LOCKSPACE_LEN+8]; + /* format 1 */ + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, S_IFREG | S_IRUGO, dlm_root, ls, &rsb_fops); if (!ls->ls_debug_rsb_dentry) - return -ENOMEM; + goto fail; - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); - - ls->ls_debug_waiters_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, - dlm_root, - ls, - &waiters_fops); - if (!ls->ls_debug_waiters_dentry) { - debugfs_remove(ls->ls_debug_rsb_dentry); - return -ENOMEM; - } + /* format 2 */ memset(name, 0, sizeof(name)); snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); @@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls) dlm_root, ls, &locks_fops); - if (!ls->ls_debug_locks_dentry) { - debugfs_remove(ls->ls_debug_waiters_dentry); - debugfs_remove(ls->ls_debug_rsb_dentry); - return -ENOMEM; - } + if (!ls->ls_debug_locks_dentry) + goto fail; + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &all_fops); + if (!ls->ls_debug_all_dentry) + goto fail; + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &waiters_fops); + if (!ls->ls_debug_waiters_dentry) + goto fail; return 0; -} -void dlm_delete_debug_file(struct dlm_ls *ls) -{ - if (ls->ls_debug_rsb_dentry) - debugfs_remove(ls->ls_debug_rsb_dentry); - if (ls->ls_debug_waiters_dentry) - debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_locks_dentry) - debugfs_remove(ls->ls_debug_locks_dentry); + fail: + dlm_delete_debug_file(ls); + return -ENOMEM; } int __init dlm_register_debugfs(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 85defeb64df4..92969f879a17 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, struct list_head *list; struct dlm_rsb *r; int offset = 0, dir_nodeid; - uint16_t be_namelen; + __be16 be_namelen; down_read(&ls->ls_root_sem); @@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { /* Write end-of-block record */ - be_namelen = 0; - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); goto out; } be_namelen = cpu_to_be16(r->res_length); - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); memcpy(outbuf + offset, r->res_name, r->res_length); offset += r->res_length; } @@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if ((list == &ls->ls_root_list) && (offset + sizeof(uint16_t) <= outlen)) { - be_namelen = 0xFFFF; - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); } out: diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 868e4c9ef127..ef2f1e353966 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -245,7 +245,8 @@ struct dlm_lkb { struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; - unsigned long lkb_timestamp; + ktime_t lkb_time_bast; /* for debugging */ + ktime_t lkb_timestamp; unsigned long lkb_timeout_cs; char *lkb_lvbptr; @@ -481,6 +482,7 @@ struct dlm_ls { struct dentry *ls_debug_rsb_dentry; /* debugfs */ struct dentry *ls_debug_waiters_dentry; /* debugfs */ struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 724ddac91538..6cfe65bbf4a2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) lkb->lkb_lksb->sb_status = rv; lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; - dlm_add_ast(lkb, AST_COMP); + dlm_add_ast(lkb, AST_COMP, 0); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) { + lkb->lkb_time_bast = ktime_get(); + if (is_master_copy(lkb)) send_bast(r, lkb, rqmode); - else { - lkb->lkb_bastmode = rqmode; - dlm_add_ast(lkb, AST_BAST); - } + else + dlm_add_ast(lkb, AST_BAST, rqmode); } /* @@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + lkb->lkb_timestamp = ktime_get(); + lkb->lkb_status = status; switch (status) { @@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - if (is_master_copy(lkb)) { - lkb->lkb_timestamp = jiffies; + if (is_master_copy(lkb)) return; - } if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { @@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb) DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); mutex_lock(&ls->ls_timeout_mutex); hold_lkb(lkb); - lkb->lkb_timestamp = jiffies; list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); mutex_unlock(&ls->ls_timeout_mutex); } @@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls) struct dlm_rsb *r; struct dlm_lkb *lkb; int do_cancel, do_warn; + s64 wait_us; for (;;) { if (dlm_locking_stopped(ls)) @@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls) mutex_lock(&ls->ls_timeout_mutex); list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + wait_us = ktime_to_us(ktime_sub(ktime_get(), + lkb->lkb_timestamp)); + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && - time_after_eq(jiffies, lkb->lkb_timestamp + - lkb->lkb_timeout_cs * HZ/100)) + wait_us >= (lkb->lkb_timeout_cs * 10000)) do_cancel = 1; if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && - time_after_eq(jiffies, lkb->lkb_timestamp + - dlm_config.ci_timewarn_cs * HZ/100)) + wait_us >= dlm_config.ci_timewarn_cs * 10000) do_warn = 1; if (!do_cancel && !do_warn) @@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls) void dlm_adjust_timeouts(struct dlm_ls *ls) { struct dlm_lkb *lkb; - long adj = jiffies - ls->ls_recover_begin; + u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); ls->ls_recover_begin = 0; mutex_lock(&ls->ls_timeout_mutex); list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) - lkb->lkb_timestamp += adj; + lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3962262f991a..103a5ebd1371 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con) con->sock->sk->sk_write_space = lowcomms_write_space; con->sock->sk->sk_state_change = lowcomms_state_change; con->sock->sk->sk_user_data = con; + con->sock->sk->sk_allocation = GFP_NOFS; return 0; } @@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con) len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); - kmap(e->page); /* Send the first block off the write queue */ iov[0].iov_base = page_address(e->page)+offset; @@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); - kunmap(e->page); free_entry(e); } spin_unlock(&con->writequeue_lock); @@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) if (e) { got_one: - if (users == 0) - kmap(e->page); *ppc = page_address(e->page) + offset; return e; } @@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh) if (users) goto out; e->len = e->end - e->offset; - kunmap(e->page); spin_unlock(&con->writequeue_lock); if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { @@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con) offset = e->offset; BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); - kmap(e->page); ret = 0; if (len) { @@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); - kunmap(e->page); free_entry(e); continue; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 54c14c6d06cb..c1775b84ebab 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) { char *p; - p = kzalloc(ls->ls_lvblen, GFP_KERNEL); + p = kzalloc(ls->ls_lvblen, ls->ls_allocation); return p; } @@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); + r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); return r; } @@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; - lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); + lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); return lkb; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 07ac709f3ed7..f3396c622aec 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, ordinary messages). */ if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); + p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); if (p == NULL) return ret; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index aa2a5775a027..ccc9d62c462d 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) data->status = lkb->lkb_status; data->grmode = lkb->lkb_grmode; data->rqmode = lkb->lkb_rqmode; - data->timestamp = lkb->lkb_timestamp; if (lkb->lkb_ua) data->xid = lkb->lkb_ua->xid; if (r) { diff --git a/fs/dlm/user.c b/fs/dlm/user.c index b3832c67194a..065149e84f42 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type) +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) ast_type = lkb->lkb_ast_type; lkb->lkb_ast_type |= type; + if (bastmode) + lkb->lkb_bastmode = bastmode; if (!ast_type) { kref_get(&lkb->lkb_ref); diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 35eb6a13d616..1c9686492286 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -9,7 +9,7 @@ #ifndef __USER_DOT_H__ #define __USER_DOT_H__ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type); +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); diff --git a/fs/dquot.c b/fs/dquot.c index c237ccc8581c..48c0571f831d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; -static void dqput(struct dquot *dquot); - static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -415,6 +413,17 @@ out_dqlock: return ret; } +void dquot_destroy(struct dquot *dquot) +{ + kmem_cache_free(dquot_cachep, dquot); +} +EXPORT_SYMBOL(dquot_destroy); + +static inline void do_destroy_dquot(struct dquot *dquot) +{ + dquot->dq_sb->dq_op->destroy_dquot(dquot); +} + /* Invalidate all dquots on the list. Note that this function is called after * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being @@ -463,9 +472,44 @@ restart: remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); - kmem_cache_free(dquot_cachep, dquot); + do_destroy_dquot(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* Call callback for every active dquot on given filesystem */ +int dquot_scan_active(struct super_block *sb, + int (*fn)(struct dquot *dquot, unsigned long priv), + unsigned long priv) +{ + struct dquot *dquot, *old_dquot = NULL; + int ret = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + spin_lock(&dq_list_lock); + list_for_each_entry(dquot, &inuse_list, dq_inuse) { + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + continue; + if (dquot->dq_sb != sb) + continue; + /* Now we have active dquot so we can just increase use count */ + atomic_inc(&dquot->dq_count); + dqstats.lookups++; + spin_unlock(&dq_list_lock); + dqput(old_dquot); + old_dquot = dquot; + ret = fn(dquot, priv); + if (ret < 0) + goto out; + spin_lock(&dq_list_lock); + /* We are safe to continue now because our dquot could not + * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); +out: + dqput(old_dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return ret; } int vfs_quota_sync(struct super_block *sb, int type) @@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); dirty = &dqopt->info[cnt].dqi_dirty_list; @@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type) } for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_dirty(&dqopt->info[cnt])) + if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) + && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); spin_lock(&dq_list_lock); dqstats.syncs++; @@ -527,7 +571,7 @@ static void prune_dqcache(int count) remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); - kmem_cache_free(dquot_cachep, dquot); + do_destroy_dquot(dquot); count--; head = free_dquots.prev; } @@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = { * NOTE: If you change this function please check whether dqput_blocks() works right... * MUST be called with either dqptr_sem or dqonoff_mutex held */ -static void dqput(struct dquot *dquot) +void dqput(struct dquot *dquot) { int ret; @@ -584,7 +628,7 @@ we_slept: /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ - if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot->dq_wait_unused); spin_unlock(&dq_list_lock); @@ -625,11 +669,17 @@ we_slept: spin_unlock(&dq_list_lock); } +struct dquot *dquot_alloc(struct super_block *sb, int type) +{ + return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); +} +EXPORT_SYMBOL(dquot_alloc); + static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; - dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); + dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NODQUOT; @@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) } /* + * Check whether dquot is in memory. + * MUST be called with either dqptr_sem or dqonoff_mutex held + */ +int dquot_is_cached(struct super_block *sb, unsigned int id, int type) +{ + unsigned int hashent = hashfn(sb, id, type); + int ret = 0; + + if (!sb_has_quota_active(sb, type)) + return 0; + spin_lock(&dq_list_lock); + if (find_dquot(hashent, sb, id, type) != NODQUOT) + ret = 1; + spin_unlock(&dq_list_lock); + return ret; +} + +/* * Get reference to dquot * MUST be called with either dqptr_sem or dqonoff_mutex held */ -static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) +struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); struct dquot *dquot, *empty = NODQUOT; - if (!sb_has_quota_enabled(sb, type)) + if (!sb_has_quota_active(sb, type)) return NODQUOT; we_slept: spin_lock(&dq_list_lock); @@ -682,7 +750,7 @@ we_slept: dqstats.lookups++; spin_unlock(&dq_list_lock); if (empty) - kmem_cache_free(dquot_cachep, empty); + do_destroy_dquot(empty); } /* Wait for dq_lock - after this we know that either dquot_release() is already * finished or it will be canceled due to dq_count > 1 test */ @@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type) } } -static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) +static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) { dquot->dq_dqb.dqb_curinodes += number; } @@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace += number; } -static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) +static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { - if (dquot->dq_dqb.dqb_curinodes > number) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; @@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) { - if (dquot->dq_dqb.dqb_curspace > number) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; - if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot) } /* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) +static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) { *warntype = QUOTA_NL_NOWARN; - if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) return QUOTA_OK; if (dquot->dq_dqb.dqb_ihardlimit && @@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) { *warntype = QUOTA_NL_NOWARN; - if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) return QUOTA_OK; if (dquot->dq_dqb.dqb_bhardlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; @@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) @@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { *warntype = QUOTA_NL_BSOFTWARN; @@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war return QUOTA_OK; } -static int info_idq_free(struct dquot *dquot, ulong inodes) +static int info_idq_free(struct dquot *dquot, qsize_t inodes) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) return QUOTA_NL_NOWARN; if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) @@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes) static int info_bdq_free(struct dquot *dquot, qsize_t space) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; - if (toqb(dquot->dq_dqb.dqb_curspace - space) <= - dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; - if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && - toqb(dquot->dq_dqb.dqb_curspace - space) < - dquot->dq_dqb.dqb_bhardlimit) + if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } @@ -1166,17 +1237,23 @@ out_err: * Release all quotas referenced by inode * Transaction must be started at an entry */ -int dquot_drop(struct inode *inode) +int dquot_drop_locked(struct inode *inode) { int cnt; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] != NODQUOT) { dqput(inode->i_dquot[cnt]); inode->i_dquot[cnt] = NODQUOT; } } + return 0; +} + +int dquot_drop(struct inode *inode) +{ + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + dquot_drop_locked(inode); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); return 0; } @@ -1264,7 +1341,7 @@ warn_put_all: /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, unsigned long number) +int dquot_alloc_inode(const struct inode *inode, qsize_t number) { int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; @@ -1349,7 +1426,7 @@ out_sub: /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, unsigned long number) +int dquot_free_inode(const struct inode *inode, qsize_t number) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1495,7 +1572,7 @@ warn_put_all: /* Wrapper for transferring ownership of an inode */ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) { - if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { + if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { vfs_dq_init(inode); if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) return 1; @@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = { .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, - .write_info = dquot_commit_info + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; -static inline void set_enable_flags(struct quota_info *dqopt, int type) -{ - switch (type) { - case USRQUOTA: - dqopt->flags |= DQUOT_USR_ENABLED; - dqopt->flags &= ~DQUOT_USR_SUSPENDED; - break; - case GRPQUOTA: - dqopt->flags |= DQUOT_GRP_ENABLED; - dqopt->flags &= ~DQUOT_GRP_SUSPENDED; - break; - } -} - -static inline void reset_enable_flags(struct quota_info *dqopt, int type, - int remount) -{ - switch (type) { - case USRQUOTA: - dqopt->flags &= ~DQUOT_USR_ENABLED; - if (remount) - dqopt->flags |= DQUOT_USR_SUSPENDED; - else - dqopt->flags &= ~DQUOT_USR_SUSPENDED; - break; - case GRPQUOTA: - dqopt->flags &= ~DQUOT_GRP_ENABLED; - if (remount) - dqopt->flags |= DQUOT_GRP_SUSPENDED; - else - dqopt->flags &= ~DQUOT_GRP_SUSPENDED; - break; - } -} - - /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ -int vfs_quota_off(struct super_block *sb, int type, int remount) +int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) { int cnt, ret = 0; struct quota_info *dqopt = sb_dqopt(sb); struct inode *toputinode[MAXQUOTAS]; + /* Cannot turn off usage accounting without turning off limits, or + * suspend quotas and simultaneously turn quotas off. */ + if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) + || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | + DQUOT_USAGE_ENABLED))) + return -EINVAL; + /* We need to serialize quota_off() for device */ mutex_lock(&dqopt->dqonoff_mutex); @@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ - if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { + if (!sb_any_quota_loaded(sb)) { mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; - /* If we keep inodes of quota files after remount and quotaoff - * is called, drop kept inodes. */ - if (!remount && sb_has_quota_suspended(sb, cnt)) { - iput(dqopt->files[cnt]); - dqopt->files[cnt] = NULL; - reset_enable_flags(dqopt, cnt, 0); + if (!sb_has_quota_loaded(sb, cnt)) continue; + + if (flags & DQUOT_SUSPENDED) { + dqopt->flags |= + dquot_state_flag(DQUOT_SUSPENDED, cnt); + } else { + dqopt->flags &= ~dquot_state_flag(flags, cnt); + /* Turning off suspended quotas? */ + if (!sb_has_quota_loaded(sb, cnt) && + sb_has_quota_suspended(sb, cnt)) { + dqopt->flags &= ~dquot_state_flag( + DQUOT_SUSPENDED, cnt); + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + continue; + } } - if (!sb_has_quota_enabled(sb, cnt)) + + /* We still have to keep quota loaded? */ + if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; - reset_enable_flags(dqopt, cnt, remount); /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); @@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) put_quota_format(dqopt->info[cnt].dqi_format); toputinode[cnt] = dqopt->files[cnt]; - if (!remount) + if (!sb_has_quota_loaded(sb, cnt)) dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; @@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) dqopt->ops[cnt] = NULL; } mutex_unlock(&dqopt->dqonoff_mutex); + + /* Skip syncing and setting flags if quota files are hidden */ + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + goto put_inodes; + /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) @@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) mutex_lock(&dqopt->dqonoff_mutex); /* If quota was reenabled in the meantime, we have * nothing to do */ - if (!sb_has_quota_enabled(sb, cnt)) { + if (!sb_has_quota_loaded(sb, cnt)) { mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); @@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) mark_inode_dirty(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); + } + if (sb->s_bdev) + invalidate_bdev(sb->s_bdev); +put_inodes: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { /* On remount RO, we keep the inode pointer so that we - * can reenable quota on the subsequent remount RW. - * But we have better not keep inode pointer when there - * is pending delete on the quota file... */ - if (!remount) + * can reenable quota on the subsequent remount RW. We + * have to check 'flags' variable and not use sb_has_ + * function because another quotaon / quotaoff could + * change global state before we got here. We refuse + * to suspend quotas when there is pending delete on + * the quota file... */ + if (!(flags & DQUOT_SUSPENDED)) iput(toputinode[cnt]); else if (!toputinode[cnt]->i_nlink) ret = -EBUSY; } - if (sb->s_bdev) - invalidate_bdev(sb->s_bdev); return ret; } +int vfs_quota_off(struct super_block *sb, int type, int remount) +{ + return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : + (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); +} + /* * Turn quotas on on a device */ -/* Helper function when we already have the inode */ -static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) { struct quota_format_type *fmt = find_quota_format(format_id); struct super_block *sb = inode->i_sb; @@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) error = -EINVAL; goto out_fmt; } + /* Usage always has to be set... */ + if (!(flags & DQUOT_USAGE_ENABLED)) { + error = -EINVAL; + goto out_fmt; + } - /* As we bypass the pagecache we must now flush the inode so that - * we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the changes */ - invalidate_bdev(sb->s_bdev); + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* As we bypass the pagecache we must now flush the inode so + * that we see all the changes from userspace... */ + write_inode_now(inode, 1); + /* And now flush the block cache so that kernel sees the + * changes */ + invalidate_bdev(sb->s_bdev); + } mutex_lock(&inode->i_mutex); mutex_lock(&dqopt->dqonoff_mutex); - if (sb_has_quota_enabled(sb, type) || - sb_has_quota_suspended(sb, type)) { + if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_lock; } - /* We don't want quota and atime on quota files (deadlocks possible) - * Also nobody should write to the file - we use special IO operations - * which ignore the immutable bit. */ - down_write(&dqopt->dqptr_sem); - oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); - inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - up_write(&dqopt->dqptr_sem); - sb->dq_op->drop(inode); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + down_write(&dqopt->dqptr_sem); + oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); + inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + up_write(&dqopt->dqptr_sem); + sb->dq_op->drop(inode); + } error = -EIO; dqopt->files[type] = igrab(inode); @@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) } mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&inode->i_mutex); - set_enable_flags(dqopt, type); + dqopt->flags |= dquot_state_flag(flags, type); add_dquot_ref(sb, type); mutex_unlock(&dqopt->dqonoff_mutex); @@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode; int ret; + unsigned int flags; mutex_lock(&dqopt->dqonoff_mutex); if (!sb_has_quota_suspended(sb, type)) { mutex_unlock(&dqopt->dqonoff_mutex); return 0; } - BUG_ON(sb_has_quota_enabled(sb, type)); - inode = dqopt->files[type]; dqopt->files[type] = NULL; - reset_enable_flags(dqopt, type, 0); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, type); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); mutex_unlock(&dqopt->dqonoff_mutex); - ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); + flags = dquot_generic_flag(flags, type); + ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, + flags); iput(inode); return ret; @@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, if (path->mnt->mnt_sb != sb) error = -EXDEV; else - error = vfs_quota_on_inode(path->dentry->d_inode, type, - format_id); + error = vfs_load_quota_inode(path->dentry->d_inode, type, + format_id, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); return error; } -/* Actual function called from quotactl() */ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, int remount) { @@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, } /* + * More powerful function for turning on quotas allowing setting + * of individual quota flags + */ +int vfs_quota_enable(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + /* Just unsuspend quotas? */ + if (flags & DQUOT_SUSPENDED) + return vfs_quota_on_remount(sb, type); + if (!flags) + return 0; + /* Just updating flags needed? */ + if (sb_has_quota_loaded(sb, type)) { + mutex_lock(&dqopt->dqonoff_mutex); + /* Now do a reliable test... */ + if (!sb_has_quota_loaded(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + goto load_quota; + } + if (flags & DQUOT_USAGE_ENABLED && + sb_has_quota_usage_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + if (flags & DQUOT_LIMITS_ENABLED && + sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); +out_lock: + mutex_unlock(&dqopt->dqonoff_mutex); + return ret; + } + +load_quota: + return vfs_load_quota_inode(inode, type, format_id, flags); +} + +/* * This function is used when filesystem needs to initialize quotas * during mount time. */ @@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, error = security_quota_on(dentry); if (!error) - error = vfs_quota_on_inode(dentry->d_inode, type, format_id); + error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: dput(dentry); @@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb) return ret; } +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; spin_lock(&dq_data_lock); - di->dqb_bhardlimit = dm->dqb_bhardlimit; - di->dqb_bsoftlimit = dm->dqb_bsoftlimit; + di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); + di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); di->dqb_curspace = dm->dqb_curspace; di->dqb_ihardlimit = dm->dqb_ihardlimit; di->dqb_isoftlimit = dm->dqb_isoftlimit; @@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) if (di->dqb_valid & QIF_SPACE) { dm->dqb_curspace = di->dqb_curspace; check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_BLIMITS) { - dm->dqb_bsoftlimit = di->dqb_bsoftlimit; - dm->dqb_bhardlimit = di->dqb_bhardlimit; + dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); + dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_INODES) { dm->dqb_curinodes = di->dqb_curinodes; check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_ILIMITS) { dm->dqb_isoftlimit = di->dqb_isoftlimit; dm->dqb_ihardlimit = di->dqb_ihardlimit; check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_BTIME) + if (di->dqb_valid & QIF_BTIME) { dm->dqb_btime = di->dqb_btime; - if (di->dqb_valid & QIF_ITIME) + check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + } + if (di->dqb_valid & QIF_ITIME) { dm->dqb_itime = di->dqb_itime; + check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + } if (check_blim) { - if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { + if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d int rc; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!(dquot = dqget(sb, id, type))) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; + dquot = dqget(sb, id, type); + if (!dquot) { + rc = -ESRCH; + goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); +out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return rc; } @@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) struct mem_dqinfo *mi; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_enabled(sb, type)) { + if (!sb_has_quota_active(sb, type)) { mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } @@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; + int err = 0; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_enabled(sb, type)) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; + if (!sb_has_quota_active(sb, type)) { + err = -ESRCH; + goto out; } mi = sb_dqopt(sb)->info + type; spin_lock(&dq_data_lock); @@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mark_info_dirty(sb, type); /* Force write to disk */ sb->dq_op->write_info(sb, type); +out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return 0; + return err; } struct quotactl_ops vfs_quotactl_ops = { @@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format); EXPORT_SYMBOL(unregister_quota_format); EXPORT_SYMBOL(dqstats); EXPORT_SYMBOL(dq_data_lock); +EXPORT_SYMBOL(vfs_quota_enable); EXPORT_SYMBOL(vfs_quota_on); EXPORT_SYMBOL(vfs_quota_on_path); EXPORT_SYMBOL(vfs_quota_on_mount); +EXPORT_SYMBOL(vfs_quota_disable); EXPORT_SYMBOL(vfs_quota_off); +EXPORT_SYMBOL(dquot_scan_active); EXPORT_SYMBOL(vfs_quota_sync); EXPORT_SYMBOL(vfs_get_dqinfo); EXPORT_SYMBOL(vfs_set_dqinfo); @@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); +EXPORT_SYMBOL(dquot_drop_locked); EXPORT_SYMBOL(vfs_dq_drop); +EXPORT_SYMBOL(dqget); +EXPORT_SYMBOL(dqput); +EXPORT_SYMBOL(dquot_is_cached); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 6046239465a1..c01e043670e2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -175,8 +175,8 @@ out: * * Returns zero on success; non-zero on error. */ -static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { int rc = 0; char dst[MD5_DIGEST_SIZE]; @@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags( crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } } static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( @@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem { static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { {0x00000001, ECRYPTFS_ENABLE_HMAC}, {0x00000002, ECRYPTFS_ENCRYPTED}, - {0x00000004, ECRYPTFS_METADATA_IN_XATTR} + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} }; /** @@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = { /** * ecryptfs_code_for_cipher_string - * @crypt_stat: The cryptographic context + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection * * Returns zero on no match, or the cipher code on match */ -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) { int i; u8 code = 0; struct ecryptfs_cipher_code_str_map_elem *map = ecryptfs_cipher_code_str_map; - if (strcmp(crypt_stat->cipher, "aes") == 0) { - switch (crypt_stat->key_size) { + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { case 16: code = RFC2440_CIPHER_AES_128; break; @@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) } } else { for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) - if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ + if (strcmp(cipher_name, map[i].cipher_str) == 0) { code = map[i].cipher_code; break; } @@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data, &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); int rc; + if (crypt_stat->extent_size == 0) + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, ecryptfs_inode); if (rc) { @@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data, } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { rc = -EINVAL; - ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n"); } out: return rc; @@ -1628,95 +1640,95 @@ out: } /** - * ecryptfs_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode - * @name: The plaintext name - * @length: The length of the plaintext - * @encoded_name: The encypted name + * ecryptfs_encrypt_filename - encrypt filename * - * Encrypts and encodes a filename into something that constitutes a - * valid filename for a filesystem, with printable characters. + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. - * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of encoded filename; negative if error + * Returns zero on success; non-zero otherwise */ -int -ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **encoded_name) +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { - int error = 0; + int rc = 0; - (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*encoded_name)) { - error = -ENOMEM; + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -ENOTSUPP; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to encrypt and encode the - * filename, the set the length accordingly. */ - memcpy((void *)(*encoded_name), (void *)name, length); - (*encoded_name)[length] = '\0'; - error = length + 1; out: - return error; + return rc; } -/** - * ecryptfs_decode_filename - converts the cipher text name to plaintext - * @crypt_stat: The crypt_stat struct associated with the file - * @name: The filename in cipher text - * @length: The length of the cipher text name - * @decrypted_name: The plaintext name - * - * Decodes and decrypts the filename. - * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. - * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of decoded filename; negative if error - */ -int -ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **decrypted_name) +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) { - int error = 0; + int rc = 0; - (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*decrypted_name)) { - error = -ENOMEM; + (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to decode and decrypt the - * filename, the set the length accordingly. */ - memcpy((void *)(*decrypted_name), (void *)name, length); - (*decrypted_name)[length + 1] = '\0'; /* Only for convenience + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience * in printing out the * string in debug * messages */ - error = length; + (*copied_name_size) = (name_size + 1); out: - return error; + return rc; } /** @@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, *key_tfm = NULL; if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { rc = -EINVAL; - printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); goto out; } @@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, get_random_bytes(dummy_key, *key_size); rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { - printk(KERN_ERR "Error attempting to set key of size [%Zd] for " + printk(KERN_ERR "Error attempting to set key of size [%zd] for " "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); rc = -EINVAL; goto out; @@ -1910,3 +1922,341 @@ out: mutex_unlock(&key_tfm_list_mutex); return rc; } + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ + 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ + 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ + 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ + 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ + 0x3D, 0x3E, 0x3F +}; + +/** + * ecryptfs_encode_for_filename + * @dst: Destination location for encoded filename + * @dst_size: Size of the encoded filename in bytes + * @src: Source location for the filename to encode + * @src_size: Size of the source in bytes + */ +void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, + unsigned char *src, size_t src_size) +{ + size_t num_blocks; + size_t block_num = 0; + size_t dst_offset = 0; + unsigned char last_block[3]; + + if (src_size == 0) { + (*dst_size) = 0; + goto out; + } + num_blocks = (src_size / 3); + if ((src_size % 3) == 0) { + memcpy(last_block, (&src[src_size - 3]), 3); + } else { + num_blocks++; + last_block[2] = 0x00; + switch (src_size % 3) { + case 1: + last_block[0] = src[src_size - 1]; + last_block[1] = 0x00; + break; + case 2: + last_block[0] = src[src_size - 2]; + last_block[1] = src[src_size - 1]; + } + } + (*dst_size) = (num_blocks * 4); + if (!dst) + goto out; + while (block_num < num_blocks) { + unsigned char *src_block; + unsigned char dst_block[4]; + + if (block_num == (num_blocks - 1)) + src_block = last_block; + else + src_block = &src[block_num * 3]; + dst_block[0] = ((src_block[0] >> 2) & 0x3F); + dst_block[1] = (((src_block[0] << 4) & 0x30) + | ((src_block[1] >> 4) & 0x0F)); + dst_block[2] = (((src_block[1] << 2) & 0x3C) + | ((src_block[2] >> 6) & 0x03)); + dst_block[3] = (src_block[2] & 0x3F); + dst[dst_offset++] = portable_filename_chars[dst_block[0]]; + dst[dst_offset++] = portable_filename_chars[dst_block[1]]; + dst[dst_offset++] = portable_filename_chars[dst_block[2]]; + dst[dst_offset++] = portable_filename_chars[dst_block[3]]; + block_num++; + } +out: + return; +} + +/** + * ecryptfs_decode_from_filename + * @dst: If NULL, this function only sets @dst_size and returns. If + * non-NULL, this function decodes the encoded octets in @src + * into the memory that @dst points to. + * @dst_size: Set to the size of the decoded string. + * @src: The encoded set of octets to decode. + * @src_size: The size of the encoded set of octets to decode. + */ +static void +ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + const unsigned char *src, size_t src_size) +{ + u8 current_bit_offset = 0; + size_t src_byte_offset = 0; + size_t dst_byte_offset = 0; + + if (dst == NULL) { + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + (*dst_size) = (((src_size + 1) * 3) / 4); + goto out; + } + while (src_byte_offset < src_size) { + unsigned char src_byte = + filename_rev_map[(int)src[src_byte_offset]]; + + switch (current_bit_offset) { + case 0: + dst[dst_byte_offset] = (src_byte << 2); + current_bit_offset = 6; + break; + case 6: + dst[dst_byte_offset++] |= (src_byte >> 4); + dst[dst_byte_offset] = ((src_byte & 0xF) + << 4); + current_bit_offset = 4; + break; + case 4: + dst[dst_byte_offset++] |= (src_byte >> 2); + dst[dst_byte_offset] = (src_byte << 6); + current_bit_offset = 2; + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); + dst[dst_byte_offset] = 0; + current_bit_offset = 0; + break; + } + src_byte_offset++; + } + (*dst_size) = dst_byte_offset; +out: + return; +} + +/** + * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. + * + * Returns zero on success; non-zero on otherwise + */ +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size) +{ + size_t encoded_name_no_prefix_size; + int rc = 0; + + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + struct ecryptfs_filename *filename; + + filename = kzalloc(sizeof(*filename), GFP_KERNEL); + if (!filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + sizeof(*filename)); + rc = -ENOMEM; + goto out; + } + filename->filename = (char *)name; + filename->filename_size = name_size; + rc = ecryptfs_encrypt_filename(filename, crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt " + "filename; rc = [%d]\n", __func__, rc); + kfree(filename); + goto out; + } + ecryptfs_encode_for_filename( + NULL, &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + else + (*encoded_name_size) = + (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); + if (!(*encoded_name)) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + (*encoded_name_size)); + rc = -ENOMEM; + kfree(filename->encrypted_filename); + kfree(filename); + goto out; + } + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + memcpy((*encoded_name), + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); + ecryptfs_encode_for_filename( + ((*encoded_name) + + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), + &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name)[(*encoded_name_size)] = '\0'; + (*encoded_name_size)++; + } else { + rc = -ENOTSUPP; + } + if (rc) { + printk(KERN_ERR "%s: Error attempting to encode " + "encrypted filename; rc = [%d]\n", __func__, + rc); + kfree((*encoded_name)); + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + } + kfree(filename->encrypted_filename); + kfree(filename); + } else { + rc = ecryptfs_copy_filename(encoded_name, + encoded_name_size, + name, name_size); + } +out: + return rc; +} + +/** + * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext + * @plaintext_name: The plaintext name + * @plaintext_name_size: The plaintext name size + * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @name: The filename in cipher text + * @name_size: The cipher text name size + * + * Decrypts and decodes the filename. + * + * Returns zero on error; non-zero otherwise + */ +int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, + size_t *plaintext_name_size, + struct dentry *ecryptfs_dir_dentry, + const char *name, size_t name_size) +{ + char *decoded_name; + size_t decoded_name_size; + size_t packet_size; + int rc = 0; + + if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + const char *orig_name = name; + size_t orig_name_size = name_size; + + name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + ecryptfs_decode_from_filename(NULL, &decoded_name_size, + name, name_size); + decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); + if (!decoded_name) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + decoded_name_size); + rc = -ENOMEM; + goto out; + } + ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, + name, name_size); + rc = ecryptfs_parse_tag_70_packet(plaintext_name, + plaintext_name_size, + &packet_size, + mount_crypt_stat, + decoded_name, + decoded_name_size); + if (rc) { + printk(KERN_INFO "%s: Could not parse tag 70 packet " + "from filename; copying through filename " + "as-is\n", __func__); + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + orig_name, orig_name_size); + goto out_free; + } + } else { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } +out_free: + kfree(decoded_name); +out: + return rc; +} diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a75026d35d16..c11fc95714ab 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -51,12 +51,16 @@ #define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_PUBKEY \ | ECRYPTFS_VERSIONING_XATTR \ | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC) + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 #define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX]; }; +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + /** * This is the primary struct associated with each encrypted file. * * TODO: cache align/pack? */ struct ecryptfs_crypt_stat { -#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 -#define ECRYPTFS_POLICY_APPLIED 0x00000002 -#define ECRYPTFS_NEW_FILE 0x00000004 -#define ECRYPTFS_ENCRYPTED 0x00000008 -#define ECRYPTFS_SECURITY_WARNING 0x00000010 -#define ECRYPTFS_ENABLE_HMAC 0x00000020 -#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 -#define ECRYPTFS_KEY_VALID 0x00000080 -#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 -#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 -#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_NEW_FILE 0x00000004 +#define ECRYPTFS_ENCRYPTED 0x00000008 +#define ECRYPTFS_SECURITY_WARNING 0x00000010 +#define ECRYPTFS_ENABLE_HMAC 0x00000020 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 +#define ECRYPTFS_KEY_VALID 0x00000080 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 +#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 +#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat { #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; size_t num_global_auth_toks; size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; /* superblock private data. */ @@ -571,13 +617,22 @@ struct ecryptfs_open_req { int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, u32 flags); +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, + const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); -int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **decrypted_name); -int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **encoded_name); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, @@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data, struct inode *ecryptfs_inode); int ecryptfs_read_and_validate_xattr_region(char *page_virt, struct dentry *ecryptfs_dentry); -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_generate_key_packet_set(char *dest_base, @@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file, struct vfsmount *lower_mnt, const struct cred *cred); int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index eb3dc4c7ac06..9e944057001b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback { /* Inspired by generic filldir in fs/readdir.c */ static int -ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, - u64 ino, unsigned int d_type) +ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct ecryptfs_crypt_stat *crypt_stat; struct ecryptfs_getdents_callback *buf = (struct ecryptfs_getdents_callback *)dirent; + size_t name_size; + char *name; int rc; - int decoded_length; - char *decoded_name; - crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; buf->filldir_called++; - decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, - &decoded_name); - if (decoded_length < 0) { - rc = decoded_length; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->dentry, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); goto out; } - rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, - ino, d_type); - kfree(decoded_name); + rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + kfree(name); if (rc >= 0) buf->entries_written++; out: @@ -106,8 +106,8 @@ out: /** * ecryptfs_readdir - * @file: The ecryptfs file struct - * @dirent: Directory entry + * @file: The eCryptfs directory file + * @dirent: Directory entry handle * @filldir: The filldir callback function */ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) @@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - struct file *lower_file = ecryptfs_file_to_lower(file); - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_inode = lower_dentry->d_inode; - int rc = -EINVAL; - - if (lower_inode->i_fop->fsync) { - mutex_lock(&lower_inode->i_mutex); - rc = lower_inode->i_fop->fsync(lower_file, lower_dentry, - datasync); - mutex_unlock(&lower_inode->i_mutex); - } - return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), + ecryptfs_dentry_to_lower(dentry), + datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 89209f00f9c7..5697899a168d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir) /** * ecryptfs_create_underlying_file * @lower_dir_inode: inode of the parent in the lower fs of the new file - * @lower_dentry: New file's dentry in the lower fs - * @ecryptfs_dentry: New file's dentry in ecryptfs + * @dentry: New file's dentry * @mode: The mode of the new file * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * @@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, { int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens - * the crypt_stat->lower_file (persistent file) */ + /* ecryptfs_do_create() calls ecryptfs_interpose() */ rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); if (unlikely(rc)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" @@ -244,141 +242,91 @@ out: } /** - * ecryptfs_lookup - * @dir: inode - * @dentry: The dentry - * @nd: nameidata, may be NULL - * - * Find a file on disk. If the file does not exist, then we'll add it to the - * dentry cache and continue on to read it from the disk. + * ecryptfs_lookup_and_interpose_lower - Perform a lookup */ -static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd) { - int rc = 0; struct dentry *lower_dir_dentry; - struct dentry *lower_dentry; struct vfsmount *lower_mnt; - char *encoded_name; - int encoded_namelen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *page_virt = NULL; - struct inode *lower_inode; u64 file_size; + int rc = 0; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - dentry->d_op = &ecryptfs_dops; - if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) - || (dentry->d_name.len == 2 - && !strcmp(dentry->d_name.name, ".."))) { - d_drop(dentry); - goto out; - } - encoded_namelen = ecryptfs_encode_filename(crypt_stat, - dentry->d_name.name, - dentry->d_name.len, - &encoded_name); - if (encoded_namelen < 0) { - rc = encoded_namelen; - d_drop(dentry); - goto out; - } - ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " - "= [%d]\n", encoded_name, encoded_namelen); - lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, - encoded_namelen - 1); - kfree(encoded_name); - if (IS_ERR(lower_dentry)) { - ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); - rc = PTR_ERR(lower_dentry); - d_drop(dentry); - goto out; - } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" - "d_name.name = [%s]\n", lower_dentry, - lower_dentry->d_name.name); + lower_dir_dentry = lower_dentry->d_parent; + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( + ecryptfs_dentry->d_parent)); lower_inode = lower_dentry->d_inode; - fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); BUG_ON(!atomic_read(&lower_dentry->d_count)); - ecryptfs_set_dentry_private(dentry, + ecryptfs_set_dentry_private(ecryptfs_dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(dentry)) { + if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); goto out_dput; } - ecryptfs_set_dentry_lower(dentry, lower_dentry); - ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); if (!lower_dentry->d_inode) { /* We want to add because we couldn't find in lower */ - d_add(dentry, NULL); + d_add(ecryptfs_dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, - ECRYPTFS_INTERPOSE_FLAG_D_ADD); + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + ecryptfs_dir_inode->i_sb, 1); if (rc) { - ecryptfs_printk(KERN_ERR, "Error interposing\n"); + printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", + __func__, rc); goto out; } - if (S_ISDIR(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); + if (S_ISDIR(lower_inode->i_mode)) goto out; - } - if (S_ISLNK(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n"); + if (S_ISLNK(lower_inode->i_mode)) goto out; - } - if (special_file(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n"); + if (special_file(lower_inode->i_mode)) goto out; - } - if (!nd) { - ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" - "as we *think* we are about to unlink\n"); + if (!ecryptfs_nd) goto out; - } /* Released in this function */ - page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, - GFP_USER); + page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); if (!page_virt) { + printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", + __func__); rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, - "Cannot ecryptfs_kmalloc a page\n"); goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(dentry); + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); - goto out; + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } } rc = ecryptfs_read_and_validate_header_region(page_virt, - dentry->d_inode); + ecryptfs_dentry->d_inode); if (rc) { - rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); + rc = ecryptfs_read_and_validate_xattr_region(page_virt, + ecryptfs_dentry); if (rc) { - printk(KERN_DEBUG "Valid metadata not found in header " - "region or xattr region; treating file as " - "unencrypted\n"); rc = 0; - kmem_cache_free(ecryptfs_header_cache_2, page_virt); - goto out; + goto out_free_kmem; } crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; } mount_crypt_stat = &ecryptfs_superblock_to_private( - dentry->d_sb)->mount_crypt_stat; + ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) file_size = (crypt_stat->num_header_bytes_at_front @@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, } else { file_size = get_unaligned_be64(page_virt); } - i_size_write(dentry->d_inode, (loff_t)file_size); + i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); +out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; - out_dput: dput(lower_dentry); - d_drop(dentry); + d_drop(ecryptfs_dentry); out: + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + struct nameidata *ecryptfs_nd) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct ecryptfs_inode_info *inode_info; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + ecryptfs_dentry->d_op = &ecryptfs_dops; + if ((ecryptfs_dentry->d_name.len == 1 + && !strcmp(ecryptfs_dentry->d_name.name, ".")) + || (ecryptfs_dentry->d_name.len == 2 + && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { + goto out_d_drop; + } + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + ecryptfs_dentry->d_name.name); + goto out_d_drop; + } + if (lower_dentry->d_inode) + goto lookup_and_interpose; + inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + if (inode_info) { + crypt_stat = &inode_info->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + } + if (crypt_stat) + mount_crypt_stat = crypt_stat->mount_crypt_stat; + else + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + && !(mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto lookup_and_interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out_d_drop; + } + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size - 1); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } +lookup_and_interpose: + rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, + crypt_stat, ecryptfs_dir_inode, + ecryptfs_nd); + goto out; +out_d_drop: + d_drop(ecryptfs_dentry); +out: + kfree(encrypted_and_encoded_name); return ERR_PTR(rc); } @@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, struct dentry *lower_dentry; struct dentry *lower_dir_dentry; char *encoded_symname; - int encoded_symlen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, - strlen(symname), - &encoded_symname); - if (encoded_symlen < 0) { - rc = encoded_symlen; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) goto out_lock; - } rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, encoded_symname); kfree(encoded_symname); @@ -602,53 +641,54 @@ out_lock: } static int -ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { - int rc; - struct dentry *lower_dentry; - char *decoded_name; char *lower_buf; - mm_segment_t old_fs; + struct dentry *lower_dentry; struct ecryptfs_crypt_stat *crypt_stat; + char *plaintext_name; + size_t plaintext_name_size; + mm_segment_t old_fs; + int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op || - !lower_dentry->d_inode->i_op->readlink) { + if (!lower_dentry->d_inode->i_op->readlink) { rc = -EINVAL; goto out; } + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Released in this function */ lower_buf = kmalloc(bufsiz, GFP_KERNEL); if (lower_buf == NULL) { - ecryptfs_printk(KERN_ERR, "Out of memory\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%d] bytes\n", __func__, bufsiz); rc = -ENOMEM; goto out; } old_fs = get_fs(); set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "lower_dentry->d_name.name = [%s]\n", - lower_dentry->d_name.name); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, bufsiz); set_fs(old_fs); if (rc >= 0) { - crypt_stat = NULL; - rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, - &decoded_name); - if (rc == -ENOMEM) + rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, + &plaintext_name_size, + dentry, lower_buf, + rc); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and " + "decrypt filename; rc = [%d]\n", __func__, + rc); goto out_free_lower_buf; - if (rc > 0) { - ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " - "to userspace: [%*s]\n", rc, - decoded_name); - if (copy_to_user(buf, decoded_name, rc)) - rc = -EFAULT; } - kfree(decoded_name); - fsstack_copy_attr_atime(dentry->d_inode, - lower_dentry->d_inode); + rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + if (rc) + rc = -EFAULT; + else + rc = plaintext_name_size; + kfree(plaintext_name); + fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); } out_free_lower_buf: kfree(lower_buf); @@ -670,13 +710,12 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) } old_fs = get_fs(); set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "dentry->d_name.name = [%s]\n", dentry->d_name.name); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); - buf[rc] = '\0'; set_fs(old_fs); if (rc < 0) goto out_free; + else + buf[rc] = '\0'; rc = 0; nd_set_link(nd, buf); goto out; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 0d713b691941..ff539420cc6f 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, /* verify that everything through the encrypted FEK size is present */ if (message_len < 4) { rc = -EIO; - printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " "message length is [%d]\n", __func__, message_len, 4); goto out; } @@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, i += data_len; if (message_len < (i + key_rec->enc_key_size)) { rc = -EIO; - printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", __func__, message_len, (i + key_rec->enc_key_size)); goto out; } if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { rc = -EIO; - printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " "the maximum key size [%d]\n", __func__, key_rec->enc_key_size, ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); @@ -403,6 +403,580 @@ out: } static int +ecryptfs_find_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*global_auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { + (*global_auth_tok) = walker; + goto out; + } + } + rc = -EINVAL; +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @crypt_stat: inode crypt_stat crypto context + * @sig: Sig of auth_tok to find + * + * For now, this function simply looks at the registered auth_tok's + * linked off the mount_crypt_stat, so all the auth_toks that can be + * used must be registered at mount time. This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on no error; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + (*auth_tok) = NULL; + if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, + mount_crypt_stat, sig)) { + struct key *auth_tok_key; + + rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, + sig); + } else + (*auth_tok) = global_auth_tok->global_auth_tok; + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's parameters in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. + */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. + */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (1 /* Tag 70 identifier */ + + 3 /* Max Tag 70 packet size */ + + ECRYPTFS_SIG_SIZE /* FNEK sig */ + + 1 /* Cipher identifier */ + + s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + rc = ecryptfs_find_auth_tok_for_sig( + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out_free_unlock; + } + /* TODO: Support other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); + kfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out_free_unlock; + } + /* TODO: Support other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + kfree(s); + return rc; +} + +static int ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) { int rc = 0; @@ -897,30 +1471,6 @@ out: return rc; } -static int -ecryptfs_find_global_auth_tok_for_sig( - struct ecryptfs_global_auth_tok **global_auth_tok, - struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) -{ - struct ecryptfs_global_auth_tok *walker; - int rc = 0; - - (*global_auth_tok) = NULL; - mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); - list_for_each_entry(walker, - &mount_crypt_stat->global_auth_tok_list, - mount_crypt_stat_list) { - if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { - (*global_auth_tok) = walker; - goto out; - } - } - rc = -EINVAL; -out: - mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); - return rc; -} - /** * ecryptfs_verify_version * @version: The version number to confirm @@ -990,43 +1540,6 @@ out: } /** - * ecryptfs_find_auth_tok_for_sig - * @auth_tok: Set to the matching auth_tok; NULL if not found - * @crypt_stat: inode crypt_stat crypto context - * @sig: Sig of auth_tok to find - * - * For now, this function simply looks at the registered auth_tok's - * linked off the mount_crypt_stat, so all the auth_toks that can be - * used must be registered at mount time. This function could - * potentially try a lot harder to find auth_tok's (e.g., by calling - * out to ecryptfsd to dynamically retrieve an auth_tok object) so - * that static registration of auth_tok's will no longer be necessary. - * - * Returns zero on no error; non-zero on error - */ -static int -ecryptfs_find_auth_tok_for_sig( - struct ecryptfs_auth_tok **auth_tok, - struct ecryptfs_crypt_stat *crypt_stat, char *sig) -{ - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - crypt_stat->mount_crypt_stat; - struct ecryptfs_global_auth_tok *global_auth_tok; - int rc = 0; - - (*auth_tok) = NULL; - if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, - mount_crypt_stat, sig)) { - struct key *auth_tok_key; - - rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, - sig); - } else - (*auth_tok) = global_auth_tok->global_auth_tok; - return rc; -} - -/** * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. * @auth_tok: The passphrase authentication token to use to encrypt the FEK * @crypt_stat: The cryptographic context @@ -1256,7 +1769,8 @@ find_next_matching_auth_tok: rc = -EINVAL; goto out_wipe_list; } - ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, + ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, + crypt_stat->mount_crypt_stat, candidate_auth_tok_sig); if (matching_auth_tok) { found_auth_tok = 1; @@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, - ecryptfs_code_for_cipher_string(crypt_stat), + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); @@ -1696,7 +2212,8 @@ encrypted_session_key_set: dest[(*packet_size)++] = 0x04; /* version 4 */ /* TODO: Break from RFC2440 so that arbitrary ciphers can be * specified with strings */ - cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); if (cipher_code == 0) { ecryptfs_printk(KERN_WARNING, "Unable to generate code for " "cipher [%s]\n", crypt_stat->cipher); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fd630713c5c7..789cf2e1be1e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, - ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -217,6 +219,9 @@ static const match_table_t tokens = { {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_err, NULL} }; @@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int rc = 0; int sig_set = 0; int cipher_name_set = 0; + int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; @@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *sig_src; char *cipher_name_dst; char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; if (!options) { rc = -EINVAL; @@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) global_default_cipher_name; strncpy(cipher_name_dst, cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_name set to: " - "[%s]\n", cipher_name_dst); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: @@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_key_size " - "set to: [%d]\n", mount_crypt_stat-> - global_default_cipher_key_size); cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: @@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; case ecryptfs_opt_err: default: - ecryptfs_printk(KERN_WARNING, - "eCryptfs: unrecognized option '%s'\n", - p); + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); } } if (!sig_set) { @@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); - strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } - if (!cipher_key_bytes_set) { + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; - } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, - NULL)) + NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); - mutex_unlock(&key_tfm_list_mutex); - if (rc) { - printk(KERN_ERR "Error attempting to initialize cipher with " - "name = [%s] and key size = [%td]; rc = [%d]\n", - mount_crypt_stat->global_default_cipher_name, - mount_crypt_stat->global_default_cipher_key_size, rc); - rc = -EINVAL; - goto out; + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); - if (rc) { + if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); - } out: return rc; } diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 6913f727624d..96ef51489e01 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } @@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index efd95a0ed1ea..a67fea655f49 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, if (!msg_ctx->msg) { rc = -ENOMEM; printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, (sizeof(*msg_ctx->msg) + data_size)); goto out_unlock; } @@ -322,7 +322,7 @@ check_list: if (count < total_length) { rc = 0; printk(KERN_WARNING "%s: Only given user buffer of " - "size [%Zd], but we need [%Zd] to read the " + "size [%zd], but we need [%zd] to read the " "pending message\n", __func__, count, total_length); goto out_unlock_msg_ctx; } @@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, if ((sizeof(*msg) + msg->data_len) != data_size) { printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " - "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, (sizeof(*msg) + msg->data_len), data_size); rc = -EINVAL; goto out; @@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, data = kmalloc(count, GFP_KERNEL); if (!data) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); + "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); goto out; } rc = copy_from_user(data, buf, count); @@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, case ECRYPTFS_MSG_RESPONSE: if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { printk(KERN_WARNING "%s: Minimum acceptable packet " - "size is [%Zd], but amount of data written is " - "only [%Zd]. Discarding response packet.\n", + "size is [%zd], but amount of data written is " + "only [%zd]. Discarding response packet.\n", __func__, (1 + 4 + 1 + sizeof(struct ecryptfs_message)), count); @@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, } i += packet_size_length; if ((1 + 4 + packet_size_length + packet_size) != count) { - printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" - " + packet_size([%Zd]))([%Zd]) != " - "count([%Zd]). Invalid packet format.\n", + printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" + " + packet_size([%zd]))([%zd]) != " + "count([%zd]). Invalid packet format.\n", __func__, packet_size_length, packet_size, (1 + packet_size_length + packet_size), count); goto out_free; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 04d7b3fa1ac6..46cec2b69796 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file, loff_t prev_page_end_size; int rc = 0; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/exec.c b/fs/exec.c index 911dd0fd7e09..605be573fe87 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -52,17 +52,13 @@ #include <linux/audit.h> #include <linux/tracehook.h> #include <linux/kmod.h> +#include <linux/fsnotify.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include "internal.h" -#ifdef __alpha__ -/* for /sbin/loader handling in search_binary_handler() */ -#include <linux/a.out.h> -#endif - int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; int suid_dumpable = 0; @@ -128,7 +124,8 @@ asmlinkage long sys_uselib(const char __user * library) if (nd.path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); + error = inode_permission(nd.path.dentry->d_inode, + MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; @@ -137,6 +134,8 @@ asmlinkage long sys_uselib(const char __user * library) if (IS_ERR(file)) goto out; + fsnotify_open(file->f_path.dentry); + error = -ENOEXEC; if(file->f_op) { struct linux_binfmt * fmt; @@ -234,13 +233,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, static int __bprm_mm_init(struct linux_binprm *bprm) { - int err = -ENOMEM; + int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) - goto err; + return -ENOMEM; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -253,28 +252,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) */ vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); - if (err) { - up_write(&mm->mmap_sem); + if (err) goto err; - } mm->stack_vm = mm->total_vm = 1; up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); - return 0; - err: - if (vma) { - bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); - } - + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); return err; } @@ -681,7 +672,7 @@ struct file *open_exec(const char *name) if (nd.path.mnt->mnt_flags & MNT_NOEXEC) goto out_path_put; - err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); + err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; @@ -689,6 +680,8 @@ struct file *open_exec(const char *name) if (IS_ERR(file)) return file; + fsnotify_open(file->f_path.dentry); + err = deny_write_access(file); if (err) { fput(file); @@ -774,7 +767,6 @@ static int de_thread(struct task_struct *tsk) struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; - struct task_struct *leader = NULL; int count; if (thread_group_empty(tsk)) @@ -812,7 +804,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(tsk)) { - leader = tsk->group_leader; + struct task_struct *leader = tsk->group_leader; sig->notify_count = -1; /* for exit_notify() */ for (;;) { @@ -864,8 +856,9 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - write_unlock_irq(&tasklist_lock); + + release_task(leader); } sig->group_exit_task = NULL; @@ -874,8 +867,6 @@ static int de_thread(struct task_struct *tsk) no_thread_group: exit_itimers(sig); flush_itimer_signals(); - if (leader) - release_task(leader); if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1181,41 +1172,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; -#ifdef __alpha__ - /* handle /sbin/loader.. */ - { - struct exec * eh = (struct exec *) bprm->buf; - - if (!bprm->loader && eh->fh.f_magic == 0x183 && - (eh->fh.f_flags & 0x3000) == 0x3000) - { - struct file * file; - unsigned long loader; - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - - loader = bprm->vma->vm_end - sizeof(void *); - - file = open_exec("/sbin/loader"); - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - /* Remember if the application is TASO. */ - bprm->taso = eh->ah.entry < 0x100000000UL; - - bprm->file = file; - bprm->loader = loader; - retval = prepare_binprm(bprm); - if (retval<0) - return retval; - /* should call search_binary_handler recursively here, - but it does not matter */ - } - } -#endif retval = security_bprm_check(bprm); if (retval) return retval; @@ -1737,7 +1694,7 @@ int get_dumpable(struct mm_struct *mm) return (ret >= 2) ? 2 : ret; } -int do_coredump(long signr, int exit_code, struct pt_regs * regs) +void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; @@ -1821,6 +1778,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (ispipe) { helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_unlock; + } /* Terminate the string before the first option */ delimit = strchr(corename, ' '); if (delimit) @@ -1888,5 +1850,5 @@ fail_unlock: put_cred(cred); coredump_finish(mm); fail: - return retval; + return; } diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 8d0add625870..66321a877e74 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -565,12 +565,8 @@ got: inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); - /* dirsync is only applied to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT2_DIRSYNC_FL; + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); ei->i_faddr = 0; ei->i_frag_no = 0; ei->i_frag_size = 0; @@ -585,7 +581,10 @@ got: spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } if (DQUOT_ALLOC_INODE(inode)) { err = -EDQUOT; @@ -612,6 +611,7 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); return ERR_PTR(err); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7658b33e2653..23fff2f87783 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> +#include <linux/namei.h> #include "ext2.h" #include "acl.h" #include "xip.h" @@ -497,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode, * ext2_splice_branch - splice the allocated branch onto inode. * @inode: owner * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext2_alloc_branch) * @where: location of missing link * @num: number of indirect blocks we are adding * @blks: number of direct blocks we are adding @@ -1286,9 +1285,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) else inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { - if (ext2_inode_is_fast_symlink(inode)) + if (ext2_inode_is_fast_symlink(inode)) { inode->i_op = &ext2_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext2_symlink_inode_operations; if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index de876fa793e1..7cb4badef927 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto setflags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT2_DIRSYNC_FL; + flags = ext2_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 2a747252ec12..90ea17998a73 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) int err = ext2_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -170,6 +172,7 @@ out: out_fail: inode_dec_link_count(inode); + unlock_new_inode(inode); iput (inode); goto out; } @@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; + int err; if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; @@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, inode_inc_link_count(inode); atomic_inc(&inode->i_count); - return ext2_add_nondir(dentry, inode); + err = ext2_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; } static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) @@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) goto out_fail; d_instantiate(dentry, inode); + unlock_new_inode(inode); out: return err; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); out_dir: inode_dec_link_count(dir); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..da8bdeaa2e6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb) percpu_counter_destroy(&sbi->s_dirs_counter); brelse (sbi->s_sbh); sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; @@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; @@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) printk ("EXT2-fs: not enough memory\n"); goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { printk ("EXT2-fs: not enough memory\n"); diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2e..7d215b4d4f2e 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i=0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 490bd0ed7896..8de6c720e510 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,12 +559,8 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT3_DIRSYNC_FL; + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); #ifdef EXT3_FRAGMENTS ei->i_faddr = 0; ei->i_frag_no = 0; @@ -579,7 +575,10 @@ got: ext3_set_inode_flags(inode); if (IS_DIRSYNC(inode)) handle->h_sync = 1; - insert_inode_hash(inode); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); @@ -627,6 +626,7 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); brelse(bitmap_bh); return ERR_PTR(err); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f8424ad89971..5fa453b49a64 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/bio.h> #include <linux/fiemap.h> +#include <linux/namei.h> #include "xattr.h" #include "acl.h" @@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; @@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext3_inode_is_fast_symlink(inode)) + if (ext3_inode_is_fast_symlink(inode)) { inode->i_op = &ext3_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); } diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8e..5e86ce9a86e0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, goto flags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT3_DIRSYNC_FL; + flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 3e5edc92aa0b..69a3d19ca9fd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; if (entry) ext3fs_dirhash(entry->name, entry->len, hinfo); @@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { @@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; ext3fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -1652,9 +1655,11 @@ static int ext3_add_nondir(handle_t *handle, if (!err) { ext3_mark_inode_dirty(handle, inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1765,6 +1770,7 @@ retry: dir_block = ext3_bread (handle, inode, 0, 1, &err); if (!dir_block) { drop_nlink(inode); /* is this nlink == 0? */ + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -1792,6 +1798,7 @@ retry: err = ext3_add_entry (handle, dentry, inode); if (err) { inode->i_nlink = 0; + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -1800,6 +1807,7 @@ retry: ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); out_stop: ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) @@ -2170,10 +2178,10 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + err = __page_symlink(inode, symname, l, 1); if (err) { drop_nlink(inode); + unlock_new_inode(inode); ext3_mark_inode_dirty(handle, inode); iput (inode); goto out_stop; @@ -2221,7 +2229,14 @@ retry: inc_nlink(inode); atomic_inc(&inode->i_count); - err = ext3_add_nondir(handle, dentry, inode); + err = ext3_add_entry(handle, dentry, inode); + if (!err) { + ext3_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext3_journal_stop(handle); if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f6c94f232ec1..b70d90e08a3c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, unsigned int); -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync); static void ext3_mark_recovery_complete(struct super_block * sb, struct ext3_super_block * es); @@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext3_unlockfs(struct super_block *sb); +static int ext3_unfreeze(struct super_block *sb); static void ext3_write_super (struct super_block * sb); -static void ext3_write_super_lockfs(struct super_block *sb); +static int ext3_freeze(struct super_block *sb); /* * Wrappers for journal_start/end. @@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, ext3_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = { .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, .mark_dirty = ext3_mark_dquot_dirty, - .write_info = ext3_write_info + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops ext3_qctl_operations = { @@ -736,8 +759,8 @@ static const struct super_operations ext3_sops = { .put_super = ext3_put_super, .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, - .write_super_lockfs = ext3_write_super_lockfs, - .unlockfs = ext3_unlockfs, + .freeze_fs = ext3_freeze, + .unfreeze_fs = ext3_unfreeze, .statfs = ext3_statfs, .remount_fs = ext3_remount, .clear_inode = ext3_clear_inode, @@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = { .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext3_export_ops = { @@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change journaled " @@ -1075,8 +1098,7 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change " "journaled quota options when " @@ -1095,8 +1117,7 @@ clear_qf_name: case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; set_qf_format: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { printk(KERN_ERR "EXT3-fs: Cannot change " "journaled quota options when " @@ -1115,8 +1136,7 @@ set_qf_format: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) { + if (sb_any_quota_loaded(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; @@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk (KERN_ERR @@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); @@ -2272,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb, return 0; } -static void ext3_commit_super (struct super_block * sb, - struct ext3_super_block * es, +static int ext3_commit_super(struct super_block *sb, + struct ext3_super_block *es, int sync) { struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) - sync_dirty_buffer(sbh); + error = sync_dirty_buffer(sbh); + return error; } @@ -2400,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext3_write_super_lockfs(struct super_block *sb) +static int ext3_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT3_SB(sb)->s_journal; + journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ journal_lock_updates(journal); @@ -2414,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb) * We don't want to clear needs_recovery flag when we failed * to flush the journal. */ - if (journal_flush(journal) < 0) - return; + error = journal_flush(journal); + if (error < 0) + goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; + +out: + journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext3_unlockfs(struct super_block *sb) +static int ext3_unfreeze(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) { lock_super(sb); @@ -2437,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb) unlock_super(sb); journal_unlock_updates(EXT3_SB(sb)->s_journal); } + return 0; } static int ext3_remount (struct super_block * sb, int * flags, char * data) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38b3acf5683b..6bba06b09dd1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -20,6 +20,7 @@ #include "ext4.h" #include "ext4_jbd2.h" #include "group.h" +#include "mballoc.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, - "Checksum bad for group %lu\n", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } @@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { - unsigned long group_desc; - unsigned long offset; + unsigned int group_desc; + unsigned int offset; struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= sbi->s_groups_count) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", + "block_group = %u, groups_count = %u", block_group, sbi->s_groups_count); return NULL; @@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, if (!sbi->s_group_desc[group_desc]) { ext4_error(sb, "ext4_get_group_desc", "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", + "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } @@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_free_blocks_sb() -- Free given blocks and update quota + * ext4_add_groupblocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physcial block to add to the block group * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota * - * XXX This function is only used by the on-line resizing code, which - * should probably be fixed up to call the mballoc variant. There - * this needs to be cleaned up later; in fact, I'm not convinced this - * is 100% correct in the face of the mballoc code. The online resizing - * code needs to be fixed up to more tightly (and correctly) interlock - * with the mballoc code. + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag */ -void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned long i; - unsigned long overflow; + unsigned int i; struct ext4_group_desc *desc; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int err = 0, ret; - ext4_grpblk_t group_freed; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; - *pdquot_freed_blocks = 0; sbi = EXT4_SB(sb); es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); - goto error_return; - } - - ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); -do_more: - overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. */ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); - count -= overflow; + goto error_return; } - brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; @@ -418,18 +422,17 @@ do_more: in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks in system zones - " + ext4_error(sb, __func__, + "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; } /* - * We are about to start releasing blocks in the bitmap, + * We are about to add blocks to the bitmap, * so we need undo access. */ - /* @@@ check errors */ BUFFER_TRACE(bitmap_bh, "getting undo access"); err = ext4_journal_get_undo_access(handle, bitmap_bh); if (err) @@ -444,107 +447,55 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { - /* - * An HJ special. This is expensive... - */ -#ifdef CONFIG_JBD2_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making jbd2_journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making jbd2_journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. - */ + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); + for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); ext4_error(sb, __func__, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); - jbd_lock_bh_state(bitmap_bh); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - group_freed++; + blocks_freed++; } } - jbd_unlock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += count; + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; spin_unlock(sb_bgl_lock(sbi, flex_group)); } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + ext4_mb_update_group_info(grp, blocks_freed); + up_write(&grp->alloc_sem); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext4_std_error(sb, err); @@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) if (dirty_blocks < 0) { printk(KERN_CRIT "Dirty block accounting " "went wrong %lld\n", - dirty_blocks); + (long long)dirty_blocks); } } /* Check whether we have space after @@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -#define EXT4_META_BLOCK 0x1 - -static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp, int flags) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - memset(&ar, 0, sizeof(ar)); - /* Fill with neighbour allocated blocks */ - - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ar.logical = iblock; - - if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) - /* enable in-core preallocation for data block allocation */ - ar.flags = EXT4_MB_HINT_DATA; - else - /* disable in-core preallocation for non-regular files */ - ar.flags = 0; - - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; - return ret; -} - /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need + * @count: pointer to total number of blocks needed * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account + * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { + struct ext4_allocation_request ar; ext4_fsblk_t ret; - ret = do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* * Account for the allocated meta blocks */ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += *count; + EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } return ret; } -/* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @errp: error code - * - * Return allocated block number on success - */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) -{ - unsigned long count = 1; - return ext4_new_meta_blocks(handle, inode, goal, &count, errp); -} - -/* - * ext4_new_blocks() -- allocate data blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: total number of blocks need - * @errp: error code - * - * Return 1st allocated block numberon success, *count stores total account - * error stores in errp pointer - */ - -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp) -{ - return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); -} - /** * ext4_count_free_blocks() -- count filesystem free blocks * @sb: superblock @@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; - unsigned long x; + unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; @@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", i, le16_to_cpu(gdp->bg_free_blocks_count), x); bitmap_count += x; } @@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + desc_count += ext4_free_blks_count(sb, gdp); } return desc_count; diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 0a7a6663c190..fa3af81ac565 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,10 +15,9 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; + unsigned int i, sum = 0; if (!map) return 0; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fed5b610df5a..2df2e40b01af 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) int ext4_check_dir_entry(const char *function, struct inode *dir, struct ext4_dir_entry_2 *de, struct buffer_head *bh, - unsigned long offset) + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len); @@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, if (error_msg != NULL) ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + "offset=%u, inode=%u, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), + le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; } @@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp, void *dirent, filldir_t filldir) { int error = 0; - unsigned long offset; + unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "ext4: call_filldir: called with " + printk(KERN_ERR "EXT4-fs: call_filldir: called with " "null fname?!?\n"); return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0537c827024..c668e4377d76 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> +#include <linux/jbd2.h> #include "ext4_i.h" /* @@ -94,9 +95,9 @@ struct ext4_allocation_request { /* phys. block for ^^^ */ ext4_fsblk_t pright; /* how many blocks we want to allocate */ - unsigned long len; + unsigned int len; /* flags. see above EXT4_MB_HINT_* */ - unsigned long flags; + unsigned int flags; }; /* @@ -156,12 +157,12 @@ struct ext4_group_desc __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ - __le16 bg_free_blocks_count; /* Free blocks count */ - __le16 bg_free_inodes_count; /* Free inodes count */ - __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ - __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ @@ -169,7 +170,7 @@ struct ext4_group_desc __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ - __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __u32 bg_reserved2[3]; }; @@ -328,6 +329,7 @@ struct ext4_mount_options { uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[MAXQUOTAS]; @@ -534,7 +536,6 @@ do { \ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ @@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ @@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEFM_JMODE_WBACK 0x0060 /* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 @@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ @@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) #define ERR_BAD_DX_DIR -75000 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern struct proc_dir_entry *ext4_proc_root; @@ -987,6 +997,9 @@ do { \ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); -extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned long); + struct buffer_head *, unsigned int); extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); -extern unsigned long ext4_count_free(struct buffer_head *, unsigned); /* mballoc.c */ extern long ext4_mb_stats; @@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); -extern int ext4_mb_add_more_groupinfo(struct super_block *sb, +extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add); - - +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); +extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); /* inode.c */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); @@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, int); @@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) + __attribute__ ((format (printf, 4, 5))); extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); +extern __u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -1225,11 +1247,11 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate FBC_BATCH blocks in their local +/* Each CPU can accumulate percpu_counter_batch blocks in their local * counters. So we need to make sure we have free blocks more - * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREEBLOCKS_WATERMARK 0 #endif @@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) return ; } +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + /* * Inodes and files operations */ @@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize); + ext4_lblk_t iblock, unsigned int max_blocks, + struct buffer_head *bh_result, + int create, int extend_disksize); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned long max_blocks, + sector_t block, unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* + * Add new method to test wether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bec7ce59fc0d..18cb67b2cbbc 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_tree_changed(struct inode *inode) -{ - EXT4_I(inode)->i_ext_generation++; -} - static inline void ext4_ext_invalidate_cache(struct inode *inode) { diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 5c124c0ac6d3..e69acc16f5c4 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t; typedef __u32 ext4_lblk_t; /* data type for block group number */ -typedef unsigned long ext4_group_t; +typedef unsigned int ext4_group_t; #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end @@ -100,9 +100,6 @@ struct ext4_inode_info { */ loff_t i_disksize; - /* on-disk additional length */ - __u16 i_extra_isize; - /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's @@ -117,7 +114,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode jinode; - unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; /* * File creation time. Its function is same as that of @@ -130,10 +126,14 @@ struct ext4_inode_info { spinlock_t i_prealloc_lock; /* allocation reservation info for delalloc */ - unsigned long i_reserved_data_blocks; - unsigned long i_reserved_meta_blocks; - unsigned long i_allocated_meta_blocks; + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + spinlock_t i_block_reservation_lock; }; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c75384b34f2c..ad13a84644e1 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,53 +7,96 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_write_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh) { - int err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh) { - int err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } else { + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) bh->b_blocknr); + err = -EIO; + } + } + } return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b455c685a98b..be2f426f6805 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -32,8 +32,8 @@ * 5 levels of tree + root which are stored in the inode. */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - || test_opt(sb, EXTENTS) ? 27U : 8U) + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); * been done yet. */ -static inline void ext4_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - jbd2_journal_release_buffer(handle, bh); -} - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ __ext4_journal_get_undo_access(__func__, (handle), (bh)) @@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where, __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__func__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); +#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) + +static inline int ext4_handle_valid(handle_t *handle) +{ + if (handle == EXT4_NOJOURNAL_HANDLE) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) { return ext4_journal_start_sb(inode->i_sb, nblocks); @@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void) static inline int ext4_journal_extend(handle_t *handle, int nblocks) { - return jbd2_journal_extend(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks) { - return jbd2_journal_restart(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; } static inline int ext4_journal_blocks_per_page(struct inode *inode) { - return jbd2_journal_blocks_per_page(inode); + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { - return jbd2_journal_force_commit(journal); + if (journal) + return jbd2_journal_force_commit(journal); + return 0; } static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return 0; } /* super.c */ @@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb); static inline int ext4_should_journal_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode) static inline int ext4_should_order_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) @@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 445fde603df8..039b6ea1a042 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -57,6 +57,7 @@ struct ext4_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; @@ -73,6 +74,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_JBD2_DEBUG struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ @@ -101,7 +104,8 @@ struct ext4_sb_info { spinlock_t s_reserve_lock; spinlock_t s_md_lock; tid_t s_last_transaction; - unsigned short *s_mb_offsets, *s_mb_maxs; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; /* tunables */ unsigned long s_stripe; @@ -146,4 +150,10 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; }; +static inline spinlock_t * +sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); +} + #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea2ce3c0ae66..54bf0623a9ae 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; + if (!ext4_handle_valid(handle)) + return 0; if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); @@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, int err; if (path->p_bh) { /* path points to block */ - err = ext4_journal_dirty_metadata(handle, path->p_bh); + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_block(handle, inode, goal, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } @@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; @@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - break; + goto got_index; } - if (depth < 0) { - /* we've gone up to the root and - * found no index to the right */ - return 0; - } + /* we've gone up to the root and found no index to the right */ + return 0; +got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ @@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, *phys = ext_pblock(ex); put_bh(bh); return 0; - } /* @@ -1622,7 +1621,6 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_tree_changed(inode); ext4_ext_invalidate_cache(inode); return err; } @@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) } } out: - ext4_ext_tree_changed(inode); ext4_ext_drop_refs(path); kfree(path); ext4_journal_stop(handle); @@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS @@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t iblock, - unsigned long max_blocks) + unsigned int max_blocks) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; @@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ newdepth = ext_depth(inode); /* - * update the extent length after successfull insert of the + * update the extent length after successful insert of the * split extent */ orig_ex.ee_len = cpu_to_le16(ee_len - @@ -2678,26 +2675,26 @@ fix_extent_len: */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, + unsigned int max_blocks, struct buffer_head *bh_result, int create, int extend_disksize) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t goal, newblock; - int err = 0, depth, ret; - unsigned long allocated = 0; + ext4_fsblk_t newblock; + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; struct ext4_allocation_request ar; loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%lu requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %u\n", iblock, max_blocks, inode->i_ino); /* check in cache */ - goal = ext4_ext_in_cache(inode, iblock, &newex); - if (goal) { - if (goal == EXT4_EXT_CACHE_GAP) { + cache_type = ext4_ext_in_cache(inode, iblock, &newex); + if (cache_type) { + if (cache_type == EXT4_EXT_CACHE_GAP) { if (!create) { /* * block isn't allocated yet and @@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (goal == EXT4_EXT_CACHE_EXTENT) { + } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ newblock = iblock - le32_to_cpu(newex.ee_block) @@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (!newblock) goto out2; ext_debug("allocate new block: goal %llu, found %llu/%lu\n", - goal, newblock, allocated); + ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); @@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode) * transaction synchronous. */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: up_write(&EXT4_I(inode)->i_data_sem); @@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) handle_t *handle; ext4_lblk_t block; loff_t new_size; - unsigned long max_blocks; + unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; @@ -3083,7 +3080,7 @@ retry: /* * Callback function called for each extent to gather FIEMAP information. */ -int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { @@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) { __u64 physical = 0; __u64 length; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6bd11fba71f7..f731cb545a03 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); - const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 556ca8eba3db..ac8f168c8ab4 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash(const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 08cac9fcace2..4fb86a0061d0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %u", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); @@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err; + int fatal = 0, err, count; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); + trace_mark(ext4_free_inode, + "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", + sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, + (unsigned long long) inode->i_blocks); /* * Note: we must free any quota before locking the superblock, @@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) spin_unlock(sb_bgl_lock(sbi, flex_group)); } } - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (!fatal) fatal = err; } - BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; sb->s_dirt = 1; @@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, for (group = 0; group < ngroups; group++) { desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -369,7 +397,7 @@ found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { desc = ext4_get_group_desc(sb, i, &bh); - if (le16_to_cpu(desc->bg_free_inodes_count)) { + if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; } @@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; *group = grp; ret = 0; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_ndir = ext4_used_dirs_count(sb, desc); } if (ret == 0) return ret; @@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; return 0; } @@ -494,8 +522,8 @@ fallback: for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && desc->bg_free_inodes_count && - le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; } @@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; /* @@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; } @@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && ext4_free_inodes_count(sb, desc)) return 0; } @@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's sb_bgl_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + spin_lock(sb_bgl_lock(sbi, group)); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + spin_unlock(sb_bgl_lock(sbi, group)); + ext4_error(sb, __func__, + "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. + * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + spin_unlock(sb_bgl_lock(sbi, group)); + return retval; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; ext4_group_t group = 0; unsigned long ino = 0; struct inode *inode; @@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, + dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -631,40 +734,52 @@ got_group: for (i = 0; i < sbi->s_groups_count; i++) { err = -EIO; - gdp = ext4_get_group_desc(sb, group, &bh2); + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto fail; - brelse(bitmap_bh); - bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!bitmap_bh) + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) goto fail; ino = 0; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino < EXT4_INODES_PER_GROUP(sb)) { - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); if (err) goto fail; - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { /* we won it */ - BUFFER_TRACE(bitmap_bh, - "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, - bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + inode, + inode_bitmap_bh); if (err) goto fail; + /* zero bit is inode number 1*/ + ino++; goto got; } /* we lost it */ - jbd2_journal_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -684,30 +799,16 @@ repeat_in_this_group: goto out; got: - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " - "block_group = %lu, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(bh2, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh2); - if (err) goto fail; - /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); + struct buffer_head *block_bitmap_bh; - BUFFER_TRACE(block_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bh); + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { - brelse(block_bh); + brelse(block_bitmap_bh); goto fail; } @@ -715,9 +816,9 @@ got: spin_lock(sb_bgl_lock(sbi, group)); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); - gdp->bg_free_blocks_count = cpu_to_le16(free); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } @@ -725,55 +826,19 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { - BUFFER_TRACE(block_bh, "dirty block bitmap"); - err = ext4_journal_dirty_metadata(handle, block_bh); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bitmap_bh); } - brelse(block_bh); + brelse(block_bitmap_bh); if (err) goto fail; } - - spin_lock(sb_bgl_lock(sbi, group)); - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - le16_to_cpu(gdp->bg_itable_unused); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - gdp->bg_itable_unused = - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); - } - - le16_add_cpu(&gdp->bg_free_inodes_count, -1); - if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); - if (err) goto fail; + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) + goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -825,8 +890,11 @@ got: ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) - handle->h_sync = 1; - insert_inode_hash(inode); + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + err = -EINVAL; + goto fail_drop; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); @@ -849,7 +917,7 @@ got: if (err) goto fail_free_drop; - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; @@ -864,6 +932,8 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", + sb->s_id, inode->i_ino, dir->i_ino, mode); goto really_out; fail: ext4_std_error(sb, err); @@ -871,7 +941,7 @@ out: iput(inode); ret = ERR_PTR(err); really_out: - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ret; fail_free_drop: @@ -881,8 +951,9 @@ fail_drop: DQUOT_DROP(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; + unlock_new_inode(inode); iput(inode); - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ERR_PTR(err); } @@ -981,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) @@ -989,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); + i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -1003,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; @@ -1020,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); + count += ext4_used_dirs_count(sb, gdp); } return count; } - diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index be21a5ae33cb..a6444cee0c7e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -34,6 +34,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include "ext4_jbd2.h" @@ -71,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. + * + * If the handle isn't valid we're not journaling so there's nothing to do. */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; + if (!ext4_handle_valid(handle)) + return 0; + might_sleep(); BUFFER_TRACE(bh, "enter"); @@ -169,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode) */ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) { - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) return 0; @@ -183,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) */ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) { + BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); return ext4_journal_restart(handle, blocks_for_truncate(inode)); } @@ -215,7 +224,7 @@ void ext4_delete_inode(struct inode *inode) } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -232,7 +241,7 @@ void ext4_delete_inode(struct inode *inode) * enough credits left in the handle to remove the inode from * the orphan list and set the dtime field. */ - if (handle->h_buffer_credits < 3) { + if (!ext4_handle_has_enough_credits(handle, 3)) { err = ext4_journal_extend(handle, 3); if (err > 0) err = ext4_journal_restart(handle, 3); @@ -505,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { - unsigned long count = 0; + unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet @@ -546,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, int indirect_blks, int blks, ext4_fsblk_t new_blocks[4], int *err) { + struct ext4_allocation_request ar; int target, i; unsigned long count = 0, blk_allocated = 0; int index = 0; @@ -594,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (!target) goto allocated; /* Now allocate data blocks */ - count = target; - /* allocating blocks for data blocks */ - current_block = ext4_new_blocks(handle, inode, iblock, - goal, &count, err); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (*err && (target == blks)) { /* * if the allocation failed and we didn't allocate @@ -613,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, */ new_blocks[index] = current_block; } - blk_allocated += count; + blk_allocated += ar.len; } allocated: /* total number of blocks allocated for direct blocks */ @@ -708,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto failed; } @@ -791,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, where->bh); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); if (err) goto err_out; } else { @@ -839,10 +856,10 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize) +static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -1044,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * It returns the error in case of allocation failure. */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned long max_blocks, struct buffer_head *bh, + unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag) { int retval; @@ -1220,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); } unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (!fatal) fatal = err; } else { @@ -1334,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_mark(ext4_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1345,7 +1366,7 @@ retry: goto out; } - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); ret = -ENOMEM; @@ -1386,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_journal_dirty_metadata(handle, bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); } /* @@ -1405,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; + trace_mark(ext4_ordered_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1443,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file, int ret = 0, ret2; loff_t new_i_size; + trace_mark(ext4_writeback_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); @@ -1478,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; + trace_mark(ext4_journalled_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1624,7 +1657,7 @@ struct mpage_da_data { get_block_t *get_block; struct writeback_control *wbc; int io_done; - long pages_written; + int pages_written; int retval; }; @@ -1644,35 +1677,39 @@ struct mpage_da_data { */ static int mpage_da_submit_io(struct mpage_da_data *mpd) { - struct address_space *mapping = mpd->inode->i_mapping; - int ret = 0, err, nr_pages, i; - unsigned long index, end; - struct pagevec pvec; long pages_skipped; + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->lbh.b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ index = mpd->first_page; end = mpd->next_page - 1; + pagevec_init(&pvec, 0); while (index <= end) { - /* - * We can use PAGECACHE_TAG_DIRTY lookup here because - * even though we have cleared the dirty flag on the page - * We still keep the page in the radix tree with tag - * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. - * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback - * which is called via the below writepage callback. - */ - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); if (!err && (pages_skipped == mpd->wbc->pages_skipped)) @@ -1830,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_count_free_blocks(inode->i_sb)); printk(KERN_EMERG "Free/Dirty block details\n"); printk(KERN_EMERG "free_blocks=%lld\n", - percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); printk(KERN_EMERG "dirty_blocks=%lld\n", - percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); printk(KERN_EMERG "Block reservation details\n"); - printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2086,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_da_writepage + */ if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need to update + * the b_state because we look at + * b_state in mpage_da_map_blocks. We don't + * update b_size because if we find an + * unmapped buffer_head later we need to + * use the b_state flag of that buffer_head. + */ + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2268,10 +2323,13 @@ static int ext4_da_writepage(struct page *page, { int ret = 0; loff_t size; - unsigned long len; + unsigned int len; struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; + trace_mark(ext4_da_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2377,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping, struct mpage_da_data mpd; struct inode *inode = mapping->host; int no_nrwrite_index_update; - long pages_written = 0, pages_skipped; + int pages_written = 0; + long pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + trace_mark(ext4_da_writepages, + "dev %s ino %lu nr_t_write %ld " + "pages_skipped %ld range_start %llu " + "range_end %llu nonblocking %d " + "for_kupdate %d for_reclaim %d " + "for_writepages %d range_cyclic %d", + inode->i_sb->s_id, inode->i_ino, + wbc->nr_to_write, wbc->pages_skipped, + (unsigned long long) wbc->range_start, + (unsigned long long) wbc->range_end, + wbc->nonblocking, wbc->for_kupdate, + wbc->for_reclaim, wbc->for_writepages, + wbc->range_cyclic); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2388,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping, */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + return -EROFS; + /* * Make sure nr_to_write is >= sbi->s_mb_stream_request * This make sure small files blocks are allocated in @@ -2432,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping, handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - printk(KERN_EMERG "%s: jbd2_start: " + printk(KERN_CRIT "%s: jbd2_start: " "%ld pages, ino %lu; err %d\n", __func__, wbc->nr_to_write, inode->i_ino, ret); dump_stack(); @@ -2485,6 +2572,14 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; + trace_mark(ext4_da_writepage_result, + "dev %s ino %lu ret %d pages_written %d " + "pages_skipped %ld congestion %d " + "more_io %d no_nrwrite_index_update %d", + inode->i_sb->s_id, inode->i_ino, ret, + pages_written, wbc->pages_skipped, + wbc->encountered_congestion, wbc->more_io, + wbc->no_nrwrite_index_update); return ret; } @@ -2497,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb) /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu - * counters can get slightly wrong with FBC_BATCH getting + * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. @@ -2536,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + + trace_mark(ext4_da_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2549,7 +2649,7 @@ retry: goto out; } - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); ret = -ENOMEM; @@ -2625,6 +2725,10 @@ static int ext4_da_write_end(struct file *file, } } + trace_mark(ext4_da_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -2717,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + BUG_ON(!EXT4_JOURNAL(inode) && + EXT4_I(inode)->i_state & EXT4_STATE_JDATA); + + if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -2835,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_normal_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2920,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_journalled_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2988,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) if (offset == 0) ClearPageChecked(page); - jbd2_journal_invalidatepage(journal, page, offset); + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -2998,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait) WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; - return jbd2_journal_try_to_free_buffers(journal, page, wait); + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); } /* @@ -3270,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); } else { if (ext4_should_order_data(inode)) err = ext4_jbd2_file_inode(handle, inode); @@ -3394,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *p; if (try_to_extend_transaction(handle, inode)) { if (bh) { - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); ext4_journal_test_restart(handle, inode); @@ -3495,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, count, block_to_free_p, p); if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this @@ -3504,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, * the block was cleared. Check for this instead of OOPSing. */ if (bh2jh(this_bh)) - ext4_journal_dirty_metadata(handle, this_bh); + ext4_handle_dirty_metadata(handle, inode, this_bh); else ext4_error(inode->i_sb, __func__, "circular indirect block detected, " @@ -3534,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, ext4_fsblk_t nr; __le32 *p; - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (depth--) { @@ -3604,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * will merely complain about releasing a free block, * rather than leaking blocks. */ - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); @@ -3623,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, parent_bh)){ *p = 0; BUFFER_TRACE(parent_bh, - "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, - parent_bh); + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); } } } @@ -3813,7 +3933,7 @@ do_indirects: * synchronous */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate(), and the file will remain alive @@ -3843,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode, ext4_fsblk_t block; int inodes_per_block, inode_offset; - iloc->bh = 0; + iloc->bh = NULL; if (!ext4_valid_inum(sb, inode->i_ino)) return -EIO; @@ -3950,7 +4070,7 @@ make_io: num = EXT4_INODES_PER_GROUP(sb); if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - num -= le16_to_cpu(gdp->bg_itable_unused); + num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; @@ -4164,9 +4284,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) + if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } @@ -4310,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; - handle->h_sync = 1; - err = ext4_journal_dirty_metadata(handle, + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); } } @@ -4338,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } - - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - rc = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, inode, bh); if (!err) err = rc; ei->i_state &= ~EXT4_STATE_NEW; @@ -4403,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } +int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) +{ + int err = 0; + + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)bh->b_blocknr); + err = -EIO; + } + } + return err; +} + /* * ext4_setattr() * @@ -4707,16 +4847,15 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); @@ -4788,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); err = ext4_reserve_inode_write(handle, inode, &iloc); - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block @@ -4840,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode) handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; + if (!ext4_handle_valid(current_handle)) { + ext4_mark_inode_dirty(current_handle, inode); + return; + } + handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; @@ -4877,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) - err = ext4_journal_dirty_metadata(handle, - iloc.bh); + err = ext4_handle_dirty_metadata(handle, + inode, + iloc.bh); brelse(iloc.bh); } } @@ -4904,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; if (is_journal_aborted(journal)) return -EROFS; @@ -4933,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); err = ext4_mark_inode_dirty(handle, inode); - handle->h_sync = 1; + ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index dc99b4776d58..42dc83fb247a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto flags_out; } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 444ad998f72e..918aec0c8a11 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -100,7 +100,7 @@ * inode as: * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we @@ -330,6 +330,18 @@ * object * */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr += first + i; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); } @@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %lu " + printk(KERN_ERR "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc\n", e4b->bd_group, i, i * 8, b1[i], b2[i]); @@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __func__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", + ext4_grp_locked_error(sb, group, __func__, + "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", group, free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * stored in the inode as * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. @@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh[i] == NULL) goto out; - if (buffer_uptodate(bh[i]) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + if (bitmap_uptodate(bh[i])) continue; lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); + set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - unlock_buffer(bh[i]); spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + unlock_buffer(bh[i]); continue; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %lu\n", first_group + i); + mb_debug("read bitmap for group %u\n", first_group + i); } /* wait for I/O completion */ @@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) err = 0; first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; struct ext4_group_info *grinfo; @@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug("put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); - memset(data, 0xff, blocksize); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* * incore got set to the group block bitmap below */ + ext4_lock_group(sb, group); ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ @@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -886,18 +922,20 @@ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; int block; int pnum; int poff; struct page *page; int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %lu\n", group); + mb_debug("load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = ext4_get_group_info(sb, group); @@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ + down_read(e4b->alloc_semp); /* * the buddy cache inode stores the block bitmap @@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_get_page(inode->i_mapping, pnum); if (page == NULL || !PageUptodate(page)) { if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { @@ -985,6 +1040,9 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); return ret; } @@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); } @@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_clear_bit_atomic(lock, cur, bm); + if (lock) + mb_clear_bit_atomic(lock, cur, bm); + else + mb_clear_bit(cur, bm); cur++; } } @@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_set_bit_atomic(lock, cur, bm); + if (lock) + mb_set_bit_atomic(lock, cur, bm); + else + mb_set_bit(cur, bm); cur++; } } @@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_unlock_group(sb, e4b->bd_group); - ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); - ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; - /* XXXXXXX: SUCH A HORRIBLE **CK */ - /*FIXME!! Why ? */ + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { spin_lock(&sbi->s_md_lock); @@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_free_extent ex; int max; + if (ac->ac_status == AC_STATUS_FOUND) + return; /* * We don't want to scan for a whole year */ @@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __func__, "%d free blocks as per " - "group info. But bitmap says 0\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " + "group info. But bitmap says 0", free); break; } @@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __func__, "%d free blocks as per " - "group info. But got %d blocks\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " + "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly @@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } +/* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock + */ +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; +} + +void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); + } + +} + +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug("init group %lu\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures we don't add group + * to this buddy cache via resize + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1775,7 +2019,7 @@ repeat: group = 0; /* quick check to skip empty groups */ - grp = ext4_get_group_info(ac->ac_sb, group); + grp = ext4_get_group_info(sb, group); if (grp->bb_free == 0) continue; @@ -1788,10 +2032,9 @@ repeat: * we need full data about the group * to make a good selection */ - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_init_group(sb, group); if (err) goto out; - ext4_mb_release_desc(&e4b); } /* @@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) if (hs->op == EXT4_MB_HISTORY_ALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); - sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, + sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, hs->goal.fe_start, hs->goal.fe_len, hs->goal.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, @@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) hs->buddy ? 1 << hs->buddy : 0); } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s discard\n", hs->pid, hs->ino, buf2); } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s free\n", hs->pid, hs->ino, buf2); @@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) return NULL; group = *pos + 1; - return (void *) group; + return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) if (*pos < 0 || *pos >= sbi->s_groups_count) return NULL; group = *pos + 1; - return (void *) group;; + return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; - long group = (long) v; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err; struct ext4_buddy e4b; @@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) sizeof(struct ext4_group_info); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5lu: I/O error\n", group); + seq_printf(seq, "#%-5u: I/O error\n", group); return 0; } ext4_lock_group(sb, group); @@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ext4_unlock_group(sb, group); ext4_mb_release_desc(&e4b); - seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? @@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, ext4_free_blocks_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); + ext4_free_blks_count(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK @@ -2327,54 +2571,6 @@ exit_meta_group_info: } /* ext4_mb_add_groupinfo */ /* - * Add a group to the existing groups. - * This function is used for online resize - */ -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, - struct ext4_group_desc *desc) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - int err; - - /* Add group based on group descriptor*/ - err = ext4_mb_add_groupinfo(sb, group, desc); - if (err) - return err; - - /* - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap - * datas) are set not up to date so that they will be re-initilaized - * during the next call to ext4_mb_load_buddy - */ - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - return 0; -} - -/* * Update an existing group. * This function is used for online resize */ @@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb) desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR - "EXT4-fs: can't read descriptor %lu\n", i); + "EXT4-fs: can't read descriptor %u\n", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_mb_offsets == NULL) { return -ENOMEM; } + + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { kfree(sbi->s_mb_maxs); @@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); - sbi->s_journal->j_commit_callback = release_blocks_on_commit; + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; @@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) list_for_each_safe(l, ltmp, &txn->t_private_list) { entry = list_entry(l, struct ext4_free_data, list); - mb_debug("gonna free %u blocks in group %lu (0x%p):", + mb_debug("gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, - (unsigned long long) discard_block, entry->count); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", + sb->s_id, (unsigned long long) discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned long reserv_blks) + handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; struct ext4_super_block *es; @@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, gdp->bg_free_blocks_count); err = ext4_journal_get_write_access(handle, gdp_bh); @@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, in_range(block + len - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, __func__, - "Allocating block in system zone - block = %llu", - block); + "Allocating block %llu in system zone of %d group\n", + block, ac->ac_b_ex.fe_group); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; @@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + mb_set_bits(NULL, bitmap_bh->b_data, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, - gdp)); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); + len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); @@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, spin_unlock(sb_bgl_lock(sbi, flex_group)); } - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_journal_dirty_metadata(handle, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: sb->s_dirt = 1; @@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - unsigned long pa_end; + ext4_lblk_t pa_end; if (pa->pa_deleted) continue; @@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - unsigned long pa_end; + ext4_lblk_t pa_end; spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + pa->pa_len; @@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) } /* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, entry->start_blk, + entry->count); + n = rb_next(n); + } + return; +} + +/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock (ext4_lock_group) @@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, preallocated += len; count++; } - mb_debug("prellocated %u for group %lu\n", preallocated, group); + mb_debug("prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head) static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { - unsigned long grp; + ext4_group_t grp; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_inode_pa, + "dev %s ino %lu pstart %llu len %u lstart %u", + sb->s_id, ac->ac_inode->i_ino, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_linear = 1; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", + sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long end; - unsigned long next; + unsigned int end; + unsigned int next; ext4_group_t group; ext4_grpblk_t bit; + unsigned long long grp_blk_start; sector_t start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } + trace_mark(ext4_mb_release_inode_pa, + "dev %s ino %lu block %llu count %u", + sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __func__, "free %u, pa_free %u\n", - free, pa->pa_free); + ext4_grp_locked_error(sb, group, + __func__, "free %u, pa_free %u", + free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. @@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; + trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", + sb->s_id, pa->pa_pstart, pa->pa_len); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %lu\n", group); + mb_debug("discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; @@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, + inode->i_ino); INIT_LIST_HEAD(&list); @@ -3874,14 +4116,14 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %u", group); ext4_mb_release_desc(&e4b); continue; } @@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; - unsigned long len; - unsigned long goal; + unsigned int len; + ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ @@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_pa = NULL; ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; + ac->alloc_semp = NULL; ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or @@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); continue; } ext4_lock_group(sb, group); @@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) } ext4_mb_put_pa(ac, ac->ac_sb, pa); } + if (ac->alloc_semp) + up_read(ac->alloc_semp); if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) @@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; + trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", + sb->s_id, needed); for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - unsigned long inquota; - unsigned long reserv_blks = 0; + unsigned int inquota; + unsigned int reserv_blks = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); + trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " + "lblk %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, ar->flags, ar->len, + ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { /* * With delalloc we already reserved the blocks @@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } if (ar->len == 0) { *errp = -EDQUOT; - return 0; + goto out3; } inquota = ar->len; @@ -4348,10 +4607,14 @@ repeat: ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; @@ -4382,6 +4645,26 @@ out2: out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); +out3: + if (!ar->len) { + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } + + trace_mark(ext4_allocate_blocks, + "dev %s block %llu flags %u len %u ino %lu " + "logical %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, (unsigned long long) block, + ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); return block; } @@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1, static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - ext4_group_t group, ext4_grpblk_t block, int count) + struct ext4_free_data *new_entry) { + ext4_grpblk_t block; + struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_data *entry, *new_entry; struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; - + BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); - new_entry->start_blk = block; - new_entry->group = group; - new_entry->count = count; - new_entry->t_tid = handle->h_transaction->t_tid; new_node = &new_entry->node; + block = new_entry->start_blk; - ext4_lock_group(sb, group); if (!*n) { /* first free block exent. We need to protect buddy cache from being freed, @@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_unlock_group(sb, group); - ext4_error(sb, __func__, - "Double free of blocks %d (%d %d)\n", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, e4b->bd_group, __func__, + "Double free of blocks %d (%d %d)", + block, entry->start_blk, entry->count); return 0; } } @@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_lock(&sbi->s_md_lock); list_add(&new_entry->list, &handle->h_transaction->t_private_list); spin_unlock(&sbi->s_md_lock); - ext4_unlock_group(sb, group); return 0; } @@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; - unsigned long overflow; + unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; @@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %lu\n", block); + trace_mark(ext4_free_blocks, + "dev %s block %llu count %lu metadata %d ino %lu", + sb->s_id, (unsigned long long) block, count, metadata, + inode ? inode->i_ino : 0); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4581,11 +4862,6 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - #ifdef AGGRESSIVE_CHECK { int i; @@ -4593,13 +4869,6 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (ac) { ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; @@ -4607,19 +4876,41 @@ do_more: ext4_mb_store_history(ac); } - if (metadata) { - /* blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed */ - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + if (metadata && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + ext4_unlock_group(sb, block_group); } else { ext4_lock_group(sb, block_group); + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); } spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_blocks_count, count); + ret = ext4_free_blks_count(sb, gdp) + count; + ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); @@ -4635,9 +4926,13 @@ do_more: *freed += count; + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b5dff1fff1e5..10a2921baf14 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -20,6 +20,7 @@ #include <linux/version.h> #include <linux/blkdev.h> #include <linux/marker.h> +#include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,9 +99,6 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; struct ext4_free_data { /* this links the free block information from group_info */ @@ -120,26 +118,6 @@ struct ext4_free_data { tid_t t_tid; }; -struct ext4_group_info { - unsigned long bb_state; - struct rb_root bb_free_root; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - struct ext4_prealloc_space { struct list_head pa_inode_list; struct list_head pa_group_list; @@ -217,6 +195,11 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -250,6 +233,7 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) @@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) { return; } -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); #endif #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); - -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { ext4_fsblk_t block; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f2a9cf498ecd..734abca25e35 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Make sure the credit we accumalated is not really high */ - if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { retval = ext4_journal_restart(handle, needed); if (retval) goto err_out; @@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) { int retval = 0, needed; - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; /* * We are freeing a blocks. During this we touch @@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode) struct list_blocks_struct lb; unsigned long max_entries; - if (!test_opt(inode->i_sb, EXTENTS)) - /* - * if mounted with noextents we don't allow the migrate - */ - return -EINVAL; - - if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 63adcb792988..fec0b4c2f5f1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (d_name) ext4fs_dirhash(d_name->name, d_name->len, hinfo); @@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name, static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, const struct qstr *d_name, - unsigned long offset, + unsigned int offset, struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; @@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { - unsigned long ino = le32_to_cpu(de->inode); + __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { - unsigned long ino; + __u32 ino; struct inode *inode; static const struct qstr dotdot = { .name = "..", @@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { ext4_error(child->d_inode->i_sb, "ext4_get_parent", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { @@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata(handle, frame->bh); + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned long offset = 0; + unsigned int offset = 0; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); brelse(bh); @@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; ext4fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext4_bread(handle, dir, block, 0, &retval); if(!bh) return retval; @@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, inode, bh2); if (err) goto journal_error; brelse (bh2); @@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_journal_dirty_metadata(handle, frames[0].bh); + ext4_handle_dirty_metadata(handle, inode, frames[0].bh); } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); return 0; } i += ext4_rec_len_from_disk(de->rec_len); @@ -1693,9 +1695,11 @@ static int ext4_add_nondir(handle_t *handle, if (!err) { ext4_mark_inode_dirty(handle, inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1723,7 +1727,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode (handle, dir, mode); err = PTR_ERR(inode); @@ -1757,7 +1761,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, mode); err = PTR_ERR(inode); @@ -1793,7 +1797,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode); err = PTR_ERR(inode); @@ -1822,14 +1826,15 @@ retry: strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; - BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_block); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, dir_block); brelse(dir_block); ext4_mark_inode_dirty(handle, inode); err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); + unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); iput(inode); goto out_stop; @@ -1838,6 +1843,7 @@ out_clear_inode: ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); out_stop: ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) @@ -1850,7 +1856,7 @@ out_stop: */ static int empty_dir(struct inode *inode) { - unsigned long offset; + unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de, *de1; struct super_block *sb; @@ -1895,7 +1901,7 @@ static int empty_dir(struct inode *inode) if (err) ext4_error(sb, __func__, "error %d reading directory" - " #%lu offset %lu", + " #%lu offset %u", err, inode->i_ino, offset); offset += sb->s_blocksize; continue; @@ -1933,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0, rc; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(sb); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -1961,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -1995,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi; - unsigned long ino_next; + __u32 ino_next; struct ext4_iloc iloc; int err = 0; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(inode->i_sb); if (list_empty(&ei->i_orphan)) { unlock_super(inode->i_sb); @@ -2017,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (!handle) + if (sbi->s_journal && !handle) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); @@ -2025,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) goto out_err; if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); + jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %lu\n", + jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) @@ -2082,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = dentry->d_inode; @@ -2136,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de); @@ -2193,7 +2205,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); @@ -2208,10 +2220,10 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + err = __page_symlink(inode, symname, l, 1); if (err) { clear_nlink(inode); + unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); iput(inode); goto out_stop; @@ -2256,13 +2268,20 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); atomic_inc(&inode->i_count); - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2298,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, return PTR_ERR(handle); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* @@ -2352,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, new_bh); + BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, new_dir, new_bh); brelse(new_bh); new_bh = NULL; } @@ -2403,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, BUFFER_TRACE(dir_bh, "get_write_access"); ext4_journal_get_write_access(handle, dir_bh); PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_bh); + BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6ec1843a015..c328be5d6885 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) ext4_warning(sb, __func__, - "Cannot add at group %u (only %lu groups)", + "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) ext4_warning(sb, __func__, "Last group not full"); @@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, { int err; - if (handle->h_buffer_credits >= thresh) + if (ext4_handle_has_enough_credits(handle, thresh)) return 0; err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); @@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(bh); goto exit_bh; } - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext4_journal_dirty_metadata(handle, it); + ext4_handle_dirty_metadata(handle, NULL, it); brelse(it); ext4_set_bit(bit, bh->b_data); } @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb, if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), - bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); - /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, input->inode_bitmap - start); @@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); exit_bh: brelse(bh); @@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_journal_dirty_metadata(handle, dind); + ext4_handle_dirty_metadata(handle, NULL, dind); brelse(dind); inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_journal_dirty_metadata(handle, *primary); + ext4_handle_dirty_metadata(handle, NULL, *primary); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); return 0; @@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, primary[i]->b_blocknr, gdbackups, blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext4_journal_dirty_metadata(handle, primary[i]); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } @@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb, struct buffer_head *bh; /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; @@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb, exit_err: if (err) { ext4_warning(sb, __func__, - "can't update backup for group %lu (err %d), " + "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) struct inode *inode = NULL; handle_t *handle; int gdb_off, gdb_num; + int num_grp_locked = 0; int err, err2; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); @@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow\n"); + ext4_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow\n"); + ext4_warning(sb, __func__, "inodes_count overflow"); return -EINVAL; } @@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } } + if ((err = verify_group_input(sb, input))) goto exit_put; @@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * using the new disk blocks. */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); + ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* * We can allocate memory for mb_alloc based on the new group * descriptor */ - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); - if (err) + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) { + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); goto exit_journal; + } /* * Make the new blocks and inodes valid next. We do this before @@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); - ext4_journal_dirty_metadata(handle, primary); + ext4_handle_dirty_metadata(handle, NULL, primary); /* Update the reserved block counts only once the new group is * active. */ @@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) EXT4_INODES_PER_GROUP(sb); } - ext4_journal_dirty_metadata(handle, sbi->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); sb->s_dirt = 1; exit_journal: @@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, struct buffer_head *bh; handle_t *handle; int err; - unsigned long freed_blocks; ext4_group_t group; - struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, - "CONFIG_LBD not enabled\n"); + ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); return -EINVAL; } @@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; unlock_super(sb); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + /* We add the blocks to the bitmap and set the group need init bit */ + ext4_add_groupblocks(handle, sb, o_blocks_count, add); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; - /* - * Mark mballoc pages as not up to date so that they will be updated - * next time they are loaded by ext4_mb_load_buddy. - * - * XXX Bad, Bad, BAD!!! We should not be overloading the - * Uptodate flag, particularly on thte bitmap bh, as way of - * hinting to ext4_mb_load_buddy() that it needs to be - * overloaded. A user could take a LVM snapshot, then do an - * on-line fsck, and clear the uptodate flag, and this would - * not be a bug in userspace, but a bug in the kernel. FIXME!!! - */ - { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Get the info on the last group */ - grp = ext4_get_group_info(sb, group); - - /* Update free blocks in group info */ - ext4_mb_update_group_info(grp, add); - } - if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e4a241c65dbe..e5f06a5f045e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); -static void ext4_unlockfs(struct super_block *sb); +static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); -static void ext4_write_super_lockfs(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } +__u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { @@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } +void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + /* * Wrappers for jbd2_journal_start/end. * @@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); + if (journal) { + if (is_journal_aborted(journal)) { + ext4_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); } - - return jbd2_journal_start(journal, nblocks); + /* + * We're not journaling, return the appropriate indication. + */ + current->journal_info = EXT4_NOJOURNAL_HANDLE; + return current->journal_info; } /* @@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int err; int rc; + if (!ext4_handle_valid(handle)) { + /* + * Do this here since we don't call jbd2_journal_stop() in + * no-journal mode. + */ + current->journal_info = NULL; + return 0; + } sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); @@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); + BUG_ON(!ext4_handle_valid(handle)); + if (bh) BUFFER_TRACE(bh, "abort"); @@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, + const char *function, const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 0); + return; + } + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish beween the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", + printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext4_abort(sb, __func__, "Couldn't clean up the journal"); - + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, __func__, + "Couldn't clean up the journal"); + } if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + /* + * Note: We can be called before EXT4_SB(sb)->s_journal is set, + * therefore it can be null here. Don't check it, just initialize + * jinode. + */ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; @@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode) } #endif ext4_discard_preallocations(inode); - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + if (EXT4_JOURNAL(inode)) + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode); } @@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + /* * We're changing the default of barrier mount option, so * let's always display its mount state so it's clear what its @@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (!test_opt(sb, EXTENTS)) - seq_puts(seq, ",noextents"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); if (!test_opt(sb, DELALLOC)) @@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = { .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops ext4_qctl_operations = { @@ -826,8 +978,8 @@ static const struct super_operations ext4_sops = { .put_super = ext4_put_super, .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, - .write_super_lockfs = ext4_write_super_lockfs, - .unlockfs = ext4_unlockfs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .remount_fs = ext4_remount, .clear_inode = ext4_clear_inode, @@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = { .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { @@ -850,16 +1003,17 @@ enum { Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks + Opt_inode_readahead_blks, Opt_journal_ioprio }; static const match_table_t tokens = { @@ -889,8 +1043,9 @@ static const match_table_t tokens = { {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -911,14 +1066,13 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_err, NULL}, }; @@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static int parse_options(char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif - ext4_fsblk_t last_block; if (!options) return 1; @@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb, } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; case Opt_journal_dev: if (is_remount) { printk(KERN_ERR "EXT4-fs: cannot specify " @@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb, option = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * option; break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; case Opt_data_journal: data_opt = EXT4_MOUNT_JOURNAL_DATA; goto datacheck; @@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change journaled " @@ -1182,8 +1343,7 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " "journaled quota options when " @@ -1202,8 +1362,7 @@ clear_qf_name: case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; set_qf_format: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { printk(KERN_ERR "EXT4-fs: Cannot change " "journaled quota options when " @@ -1222,7 +1381,7 @@ set_qf_format: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { + if (sb_any_quota_loaded(sb)) { printk(KERN_ERR "EXT4-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1280,33 +1439,6 @@ set_qf_format: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; - case Opt_extents: - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - ext4_warning(sb, __func__, - "extents feature not enabled " - "on this filesystem, use tune2fs\n"); - return 0; - } - set_opt(sbi->s_mount_opt, EXTENTS); - break; - case Opt_noextents: - /* - * When e2fsprogs support resizing an already existing - * ext3 file system to greater than 2**32 we need to - * add support to block allocator to handle growing - * already existing block mapped inode so that blocks - * allocated for them fall within 2**32 - */ - last_block = ext4_blocks_count(sbi->s_es) - 1; - if (last_block > 0xffffffffULL) { - printk(KERN_ERR "EXT4-fs: Filesystem too " - "large to mount with " - "-o noextents options\n"); - return 0; - } - clear_opt(sbi->s_mount_opt, EXTENTS); - break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; @@ -1331,6 +1463,14 @@ set_qf_format: return 0; sbi->s_inode_readahead_blks = option; break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, printk(KERN_WARNING "EXT4-fs warning: checktime reached, " "running e2fsck is recommended\n"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - * inconsistencies, to force a fsck at reboot. But for - * a plain journaled filesystem we can keep it set as - * valid forever! :) - */ - es->s_state &= cpu_to_le16(~EXT4_VALID_FS); -#endif + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", sb->s_blocksize, sbi->s_groups_count, @@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", - sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); + if (EXT4_SB(sb)->s_journal) { + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); + } else { + printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); + } return res; } @@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; - __u64 block_bitmap = 0; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb) sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " - "%lu flex groups\n", flex_group_count); + "%u flex groups\n", flex_group_count); goto failed; } - gdp = ext4_get_group_desc(sb, 1, &bh); - block_bitmap = ext4_block_bitmap(sb, gdp) - 1; - for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); sbi->s_flex_groups[flex_group].free_inodes += - le16_to_cpu(gdp->bg_free_inodes_count); + ext4_free_inodes_count(sb, gdp); sbi->s_flex_groups[flex_group].free_blocks += - le16_to_cpu(gdp->bg_free_blocks_count); + ext4_free_blks_count(sb, gdp); } return 1; @@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Block bitmap for group %lu not in group " + "Block bitmap for group %u not in group " "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode bitmap for group %lu not in group " + "Inode bitmap for group %u not in group " "(block %llu)!\n", i, inode_bitmap); return 0; } @@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode table for group %lu not in group " + "Inode table for group %u not in group " "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { @@ -1721,7 +1856,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) /* small i_blocks in vfs inode? */ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode + * CONFIG_LBD is not enabled implies the inode * i_block represent total blocks in 512 bytes * 32 == size of vfs inode i_blocks * 8 */ @@ -1764,7 +1899,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * !has_huge_files or CONFIG_LSF is not enabled + * !has_huge_files or CONFIG_LBD is not enabled * implies the inode i_block represent total blocks in * 512 bytes 32 == size of vfs inode i_blocks * 8 */ @@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; char *cp; + const char *descr; int ret = -EINVAL; int blocksize; - int db_count; - int i; + unsigned int db_count; + unsigned int i; int needs_recovery, has_huge_files; - __le32 features; + int features; __u64 blocks_count; int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* - * turn on extents feature by default in ext4 filesystem - * only if feature flag already set by mkfs or tune2fs. - * Use -o noextents to turn it off - */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - set_opt(sbi->s_mount_opt, EXTENTS); - else - ext4_warning(sb, __func__, - "extents feature not enabled on this filesystem, " - "use tune2fs.\n"); - - /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); if (features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); goto failed_mount; } features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); goto failed_mount; } has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -2021,13 +2150,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (has_huge_files) { /* * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LSF + * mount if kernel is build with CONFIG_LBD */ if (sizeof(root->i_blocks) < sizeof(u64) && !(sb->s_flags & MS_RDONLY)) { printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " "files cannot be mounted read-write " - "without CONFIG_LSF.\n", sb->s_id); + "without CONFIG_LBD.\n", sb->s_id); goto failed_mount; } } @@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk(KERN_ERR @@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; - /* ensure blocks_count calculation below doesn't sign-extend */ - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < - le32_to_cpu(es->s_first_data_block) + 1) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " - "first data block %u, blocks per group %lu\n", - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + "block %u is beyond end of filesystem (%llu)\n", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)\n", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); @@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, es, 1); - printk(KERN_CRIT - "EXT4-fs (device %s): mount failed\n", - sb->s_id); goto failed_mount4; } } - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + printk(KERN_ERR "EXT4-fs: required journal recovery " + "suppressed and not mounted read-only\n"); + goto failed_mount4; } else { - if (!silent) - printk(KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); - goto failed_mount3; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; } if (ext4_blocks_count(es) > 0xffffffffULL && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); + printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); goto failed_mount4; } @@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + +no_journal: if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { @@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { printk(KERN_INFO "EXT4-fs: recovery complete.\n"); - ext4_mark_recovery_complete(sb, es); - printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", + sb->s_id, descr); lock_kernel(); return 0; @@ -2438,8 +2600,11 @@ cantfind_ext4: goto failed_mount; failed_mount4: - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; spin_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent * iput() will try to delete it. */ @@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + bdev = ext4_blkdev_get(j_dev); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { printk(KERN_ERR - "EXT4: failed to claim external journal device.\n"); + "EXT4-fs: failed to claim external journal device.\n"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { printk(KERN_INFO "EXT4-fs: external journal device major/minor " @@ -2719,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; - } - - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - - return 0; -} - -static void ext4_commit_super(struct super_block *sb, +static int ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; if (!sbh) - return; + return error; if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -2777,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb, * be remapped. Nothing we can do but to retry the * write and hope for the best. */ - printk(KERN_ERR "ext4: previous I/O error to " + printk(KERN_ERR "EXT4-fs: previous I/O error to " "superblock detected for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); - ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { - sync_dirty_buffer(sbh); - if (buffer_write_io_error(sbh)) { - printk(KERN_ERR "ext4: I/O error while writing " + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } } + return error; } @@ -2809,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb, { journal_t *journal = EXT4_SB(sb)->s_journal; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } jbd2_journal_lock_updates(journal); if (jbd2_journal_flush(journal) < 0) goto out; @@ -2838,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb, int j_errno; const char *errstr; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + journal = EXT4_SB(sb)->s_journal; /* @@ -2870,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret; + int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - sb->s_dirt = 0; - ret = ext4_journal_force_commit(journal); + if (journal) { + sb->s_dirt = 0; + ret = ext4_journal_force_commit(journal); + } + return ret; } @@ -2889,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb) */ static void ext4_write_super(struct super_block *sb) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; + if (EXT4_SB(sb)->s_journal) { + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + } } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -2900,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (EXT4_SB(sb)->s_journal) { + if (wait) + ret = ext4_force_commit(sb); + else + jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + } return ret; } @@ -2911,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. */ -static void ext4_write_super_lockfs(struct super_block *sb) +static int ext4_freeze(struct super_block *sb) { + int error = 0; + journal_t *journal; sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT4_SB(sb)->s_journal; + journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - if (jbd2_journal_flush(journal) < 0) - return; + /* + * We don't want to clear needs_recovery flag when we + * failed to flush the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + if (error) + goto out; } + return 0; +out: + jbd2_journal_unlock_updates(journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext4_unlockfs(struct super_block *sb) +static int ext4_unfreeze(struct super_block *sb) { - if (!(sb->s_flags & MS_RDONLY)) { + if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2948,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb) unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } + return 0; } static int ext4_remount(struct super_block *sb, int *flags, char *data) @@ -2958,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; @@ -2969,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) old_opts.s_qf_names[i] = sbi->s_qf_names[i]; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -2991,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - ext4_init_journal_params(sb, sbi->s_journal); + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { @@ -3020,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * We have to unlock super so that we can wait for * transactions. */ - unlock_super(sb); - ext4_mark_recovery_complete(sb, es); - lock_super(sb); + if (sbi->s_journal) { + unlock_super(sb); + ext4_mark_recovery_complete(sb, es); + lock_super(sb); + } } else { - __le32 ret; + int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))) { printk(KERN_WARNING "EXT4-fs: %s: couldn't " "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + "optional features (%x).\n", sb->s_id, + (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); err = -EROFS; goto restore_opts; } @@ -3047,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_remount: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EINVAL; @@ -3076,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * been changed by e2fsck since we originally mounted * the partition.) */ - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if ((err = ext4_group_extend(sb, es, n_blocks_count))) goto restore_opts; @@ -3084,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + if (sbi->s_journal == NULL) + ext4_commit_super(sb, es, 1); + #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -3098,6 +3280,8 @@ restore_opts: sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -3360,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext4_should_journal_data(path.dentry->d_inode)) { + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path.dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -3434,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (!handle) { + if (EXT4_SB(sb)->s_journal && !handle) { printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); @@ -3459,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); unlock_buffer(bh); if (journal_quota) - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); else { /* Always do at least ordered writes for quotas */ err = ext4_jbd2_file_inode(handle, inode); @@ -3513,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file) static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, size_t cnt, loff_t *ppos) { - unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; char str[32]; - unsigned long value; if (cnt >= sizeof(str)) return -EINVAL; if (copy_from_user(str, buf, cnt)) return -EFAULT; - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - *p = value; + + *p = simple_strtoul(str, NULL, 0); return cnt; } @@ -3615,7 +3797,7 @@ static void __exit exit_ext4_fs(void) } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(init_ext4_fs) module_exit(exit_ext4_fs) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 80626d516fee..157ce6589c54 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); sb->s_dirt = 1; - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_forget(handle, 1, inode, bh, bh->b_blocknr); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext4_journal_dirty_metadata(handle, bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); DQUOT_FREE_BLOCK(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); @@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error == -EIO) goto bad_block; if (!error) - error = ext4_journal_dirty_metadata(handle, - bs->bh); + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -794,8 +795,9 @@ inserted: ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_journal_dirty_metadata(handle, - new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -810,8 +812,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_block(handle, inode, - goal, &error); + ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + goal, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); @@ -833,7 +835,8 @@ getblk_failed: set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(new_bh); - error = ext4_journal_dirty_metadata(handle, new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); if (error) goto cleanup; } @@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, */ is.iloc.bh = NULL; if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); } cleanup: diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 67e058357098..3a7f603b6982 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = { .compat_ioctl = fat_compat_dir_ioctl, #endif .fsync = file_fsync, - .llseek = generic_file_llseek, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d937aaf77374..6b74d09adbe5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child) brelse(bh); parent = d_obtain_alias(inode); + if (!IS_ERR(parent)) + parent->d_op = sb->s_root->d_op; out: unlock_super(sb); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index bf326d4356a3..8ae32e37673c 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) * for creation. */ if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { - if (nd->flags & LOOKUP_CREATE) + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; } diff --git a/fs/file_table.c b/fs/file_table.c index 0fbcacc3ea75..bbeeac6efa1a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -32,6 +32,9 @@ struct files_stat_struct files_stat = { /* public. Not pretty! */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +/* SLAB cache for file structures */ +static struct kmem_cache *filp_cachep __read_mostly; + static struct percpu_counter nr_files __cacheline_aligned_in_smp; static inline void file_free_rcu(struct rcu_head *head) @@ -397,7 +400,12 @@ too_bad: void __init files_init(unsigned long mempages) { int n; - /* One file with associated inode and dcache is very roughly 1K. + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + /* + * One file with associated inode and dcache is very roughly 1K. * Per default don't use more than 10% of our memory for files. */ diff --git a/fs/filesystems.c b/fs/filesystems.c index d0e20ced62dd..d488dcd7f2bb 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void) module_init(proc_filesystems_init); #endif -struct file_system_type *get_fs_type(const char *name) +static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - const char *dot = strchr(name, '.'); - unsigned len = dot ? dot - name : strlen(name); read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); - if (!fs && (request_module("%.*s", len, name) == 0)) { - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) - fs = NULL; - read_unlock(&file_systems_lock); - } + return fs; +} + +struct file_system_type *get_fs_type(const char *name) +{ + struct file_system_type *fs; + const char *dot = strchr(name, '.'); + int len = dot ? dot - name : strlen(name); + + fs = __get_fs_type(name, len); + if (!fs && (request_module("%.*s", len, name) == 0)) + fs = __get_fs_type(name, len); if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 9f3f2ceb73f0..03a6ea5e99f7 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino) if (!VXFS_ISIMMED(vip)) { ip->i_op = &page_symlink_inode_operations; ip->i_mapping->a_ops = &vxfs_aops; - } else + } else { ip->i_op = &vxfs_immed_symlink_iops; + vip->vii_immed.vi_immed[ip->i_size] = '\0'; + } } else init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf309..e5eaa62fd17f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If we're a pdlfush thread, then implement pdflush collision avoidance * against the entire list. * - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) @@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb, __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); - if (wbc->sync_mode == WB_SYNC_HOLD) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } if (current_is_pdflush()) writeback_release(bdi); if (wbc->pages_skipped != pages_skipped) { @@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } - spin_unlock(&inode_lock); + + if (sync) { + struct inode *inode, *old_inode = NULL; + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); + } else + spin_unlock(&inode_lock); + return; /* Leave any unwritten inodes on s_io */ } EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); @@ -588,8 +624,7 @@ restart: /* * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is - * used to park the written inodes on sb->s_dirty for the wait pass. + * do this in two passes - one to write, and one to wait. * * A finite limit is set on the number of pages which will be written. * To prevent infinite livelock of sys_sync(). @@ -600,30 +635,21 @@ restart: void sync_inodes_sb(struct super_block *sb, int wait) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - sync_sb_inodes(sb, &wbc); -} + if (!wait) { + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); -/* - * Rather lame livelock avoidance. - */ -static void set_sb_syncing(int val) -{ - struct super_block *sb; - spin_lock(&sb_lock); - list_for_each_entry_reverse(sb, &super_blocks, s_list) - sb->s_syncing = val; - spin_unlock(&sb_lock); + wbc.nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + } else + wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ + + sync_sb_inodes(sb, &wbc); } /** @@ -652,9 +678,6 @@ static void __sync_inodes(int wait) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); @@ -672,13 +695,10 @@ restart: void sync_inodes(int wait) { - set_sb_syncing(0); __sync_inodes(0); - if (wait) { - set_sb_syncing(0); + if (wait) __sync_inodes(1); - } } /** diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4f3cab321415..99c99dfb0373 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, size_t size; if (!*ppos) { + long value; struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; - file->private_data=(void *)(long)atomic_read(&fc->num_waiting); + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; fuse_conn_put(fc); } size = sprintf(tmp, "%ld\n", (long)file->private_data); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fba571648a8e..e0c7ada08a1f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) +__releases(&fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); if (end) end(fc, req); - else - fuse_put_request(fc, req); + fuse_put_request(fc, req); } static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { if (signal_pending(current)) return; @@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) } } -void request_send(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; spin_lock(&fc->lock); @@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } -static void request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) { req->background = 1; fc->num_background++; @@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); if (fc->connected) { - request_send_nowait_locked(fc, req); + fuse_request_send_nowait_locked(fc, req); spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; @@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) } } -void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 0; - request_send_nowait(fc, req); + fuse_request_send_nowait(fc, req); } -void request_send_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - request_send_nowait(fc, req); + fuse_request_send_nowait(fc, req); } /* @@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * * fc->connected must have been checked previously */ -void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { req->isreply = 1; - request_send_nowait_locked(fc, req); + fuse_request_send_nowait_locked(fc, req); } /* @@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->seglen = cs->iov[0].iov_len; cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov ++; - cs->nr_segs --; + cs->iov++; + cs->nr_segs--; } down_read(¤t->mm->mmap_sem); err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, @@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, kunmap_atomic(mapaddr, KM_USER1); } while (count) { - int err; - if (!cs->len && (err = fuse_copy_fill(cs))) - return err; + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } if (page) { void *mapaddr = kmap_atomic(page, KM_USER1); void *buf = mapaddr + offset; @@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) { while (size) { - int err; - if (!cs->len && (err = fuse_copy_fill(cs))) - return err; + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } fuse_copy_do(cs, &val, &size); } return 0; @@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) +__releases(&fc->lock) +__acquires(&fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, const struct iovec *iov, unsigned long nr_segs) - __releases(fc->lock) +__releases(&fc->lock) { struct fuse_copy_state cs; struct fuse_in_header ih; @@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return err; } +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err; + + if (size != sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + return fuse_notify_poll_wakeup(fc, &outarg); +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + default: + return -EINVAL; + } +} + /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) { @@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, err = fuse_copy_one(&cs, &oh, sizeof(oh)); if (err) goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); + fuse_copy_finish(&cs); + return err ? err : nbytes; + } + err = -EINVAL; - if (!oh.unique || oh.error <= -1000 || oh.error > 0 || - oh.len != nbytes) + if (oh.error <= -1000 || oh.error > 0) goto err_finish; spin_lock(&fc->lock); @@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(&fc->lock) +__acquires(&fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) * locked). */ static void end_io_requests(struct fuse_conn *fc) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc) wake_up(&req->waitq); if (end) { req->end = NULL; - /* The end function will consume this reference */ __fuse_get_request(req); spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); + fuse_put_request(fc, req); spin_lock(&fc->lock); } } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95bc22bdd060..fdff346e96fd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) parent = dget_parent(entry); fuse_lookup_init(fc, req, get_node_id(parent->d_inode), &entry->d_name, &outarg); - request_send(fc, req); + fuse_request_send(fc, req); dput(parent); err = req->out.h.error; fuse_put_request(fc, req); @@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; } spin_lock(&fc->lock); - fi->nlookup ++; + fi->nlookup++; spin_unlock(&fc->lock); } fuse_put_request(fc, forget_req); @@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, attr_version = fuse_get_attr_version(fc); fuse_lookup_init(fc, req, nodeid, name, outarg); - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ @@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, { fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); ff->reserved_req->force = 1; - request_send(fc, ff->reserved_req); + fuse_request_send(fc, ff->reserved_req); fuse_put_request(fc, ff->reserved_req); kfree(ff); } @@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, goto out_put_forget_req; err = -ENOMEM; - ff = fuse_file_alloc(); + ff = fuse_file_alloc(fc); if (!ff) goto out_put_request; @@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, req->out.args[0].value = &outentry; req->out.args[1].size = sizeof(outopen); req->out.args[1].value = &outopen; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; if (err) { if (err == -ENOSYS) @@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err) @@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { struct inode *inode = entry->d_inode; - /* Set nlink to zero so the inode can be cleared, if - the inode does have more links this will be - discovered at the next lookup/getattr */ + /* + * Set nlink to zero so the inode can be cleared, if the inode + * does have more links this will be discovered at the next + * lookup/getattr. + */ clear_nlink(inode); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, req->in.args[1].value = oldent->d_name.name; req->in.args[2].size = newent->d_name.len + 1; req->in.args[2].value = newent->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) req->num_pages = 1; req->pages[0] = page; fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); - request_send(fc, req); + fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); @@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry) req->out.numargs = 1; req->out.args[0].size = PAGE_SIZE - 1; req->out.args[0].value = link; - request_send(fc, req); + fuse_request_send(fc, req); if (req->out.h.error) { free_page((unsigned long) link); link = ERR_PTR(req->out.h.error); @@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err) { @@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, req->in.args[1].value = name; req->in.args[2].size = size; req->in.args[2].value = value; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; } - request_send(fc, req); + fuse_request_send(fc, req); ret = req->out.h.error; if (!ret) ret = size ? req->out.args[0].size : outarg.size; @@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; } - request_send(fc, req); + fuse_request_send(fc, req); ret = req->out.h.error; if (!ret) ret = size ? req->out.args[0].size : outarg.size; @@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) req->in.numargs = 1; req->in.args[0].size = strlen(name) + 1; req->in.args[0].value = name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34930a964b82..e8162646a9b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, req->out.numargs = 1; req->out.args[0].size = sizeof(*outargp); req->out.args[0].value = outargp; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); return err; } -struct fuse_file *fuse_file_alloc(void) +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); @@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void) } else { INIT_LIST_HEAD(&ff->write_entry); atomic_set(&ff->count, 0); + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); } + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); } return ff; } @@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { dput(req->misc.release.dentry); mntput(req->misc.release.vfsmount); - fuse_put_request(fc, req); } static void fuse_file_put(struct fuse_file *ff) @@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff) struct inode *inode = req->misc.release.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); req->end = fuse_release_end; - request_send_background(fc, req); + fuse_request_send_background(fc, req); kfree(ff); } } @@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file, int fuse_open_common(struct inode *inode, struct file *file, int isdir) { + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_open_out outarg; struct fuse_file *ff; int err; @@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) if (err) return err; - ff = fuse_file_alloc(); + ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; @@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) spin_lock(&fc->lock); list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); + + wake_up_interruptible_sync(&ff->poll_wait); /* * Normally this will send the RELEASE request, * however if some asynchronous READ or WRITE requests @@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; req->force = 1; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - request_send(fc, req); + fuse_request_send(fc, req); return req->out.args[0].size; } @@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) } if (req->ff) fuse_file_put(req->ff); - fuse_put_request(fc, req); } static void fuse_send_readpages(struct fuse_req *req, struct file *file, @@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, struct fuse_file *ff = file->private_data; req->ff = fuse_file_get(ff); req->end = fuse_readpages_end; - request_send_background(fc, req); + fuse_request_send_background(fc, req); } else { - request_send(fc, req); + fuse_request_send(fc, req); fuse_readpages_end(fc, req); + fuse_put_request(fc, req); } } @@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) } } req->pages[req->num_pages] = page; - req->num_pages ++; + req->num_pages++; return 0; } @@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - request_send(fc, req); + fuse_request_send(fc, req); return req->misc.write.out.size; } @@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; @@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, break; err = -ENOMEM; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, 0); if (!page) break; @@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); fuse_file_put(req->ff); - fuse_put_request(fc, req); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +__releases(&fc->lock) +__acquires(&fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) req->in.args[1].size = inarg->size; fi->writectr++; - request_send_background_locked(fc, req); + fuse_request_send_background_locked(fc, req); return; out_free: fuse_writepage_finish(fc, req); spin_unlock(&fc->lock); fuse_writepage_free(fc, req); + fuse_put_request(fc, req); spin_lock(&fc->lock); } @@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) +__releases(&fc->lock) +__acquires(&fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) req->out.numargs = 1; req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) @@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return PTR_ERR(req); fuse_lk_fill(req, file, fl, opcode, pid, flock); - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; /* locking is restartable */ if (err == -EINTR) @@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) req->out.numargs = 1; req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) @@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) return retval; } +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, iov, nr_segs, bytes, 0); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr, *map; + + kaddr = map = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(map); + } + + return 0; +} + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct page *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = -ENOMEM; + pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + iov_page = alloc_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = page_address(iov_page); + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + char *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + err = -EIO; + if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) + goto out; + + /* okay, copy in iovs and retry */ + vaddr = kmap_atomic(pages[0], KM_USER0); + memcpy(page_address(iov_page), vaddr, transferred); + kunmap_atomic(vaddr, KM_USER0); + + in_iov = page_address(iov_page); + out_iov = in_iov + in_iovs; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + if (iov_page) + __free_page(iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? err : outarg.result; +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_do_ioctl(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +static unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + struct fuse_req *req; + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_POLL; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read = do_sync_read, @@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = { .fsync = fuse_fsync, .lock = fuse_file_lock, .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, /* no mmap and splice_read */ }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 35accfdd747f..5e64b815a5a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -19,6 +19,8 @@ #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/poll.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -100,6 +102,9 @@ struct fuse_file { /** Request reserved for flush and release */ struct fuse_req *reserved_req; + /** Kernel file handle guaranteed to be unique */ + u64 kh; + /** File handle used by userspace */ u64 fh; @@ -108,6 +113,12 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; }; /** One input argument of a request */ @@ -322,6 +333,12 @@ struct fuse_conn { /** The list of requests under I/O */ struct list_head io; + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + /** Number of requests currently in the background */ unsigned num_background; @@ -355,19 +372,19 @@ struct fuse_conn { /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ - unsigned conn_error : 1; + unsigned conn_error:1; /** Connection successful. Only set in INIT */ - unsigned conn_init : 1; + unsigned conn_init:1; /** Do readpages asynchronously? Only set in INIT */ - unsigned async_read : 1; + unsigned async_read:1; /** Do not send separate SETATTR request before open(O_TRUNC) */ - unsigned atomic_o_trunc : 1; + unsigned atomic_o_trunc:1; /** Filesystem supports NFS exporting. Only set in INIT */ - unsigned export_support : 1; + unsigned export_support:1; /* * The following bitfields are only for optimization purposes @@ -375,43 +392,46 @@ struct fuse_conn { */ /** Is fsync not implemented by fs? */ - unsigned no_fsync : 1; + unsigned no_fsync:1; /** Is fsyncdir not implemented by fs? */ - unsigned no_fsyncdir : 1; + unsigned no_fsyncdir:1; /** Is flush not implemented by fs? */ - unsigned no_flush : 1; + unsigned no_flush:1; /** Is setxattr not implemented by fs? */ - unsigned no_setxattr : 1; + unsigned no_setxattr:1; /** Is getxattr not implemented by fs? */ - unsigned no_getxattr : 1; + unsigned no_getxattr:1; /** Is listxattr not implemented by fs? */ - unsigned no_listxattr : 1; + unsigned no_listxattr:1; /** Is removexattr not implemented by fs? */ - unsigned no_removexattr : 1; + unsigned no_removexattr:1; /** Are file locking primitives not implemented by fs? */ - unsigned no_lock : 1; + unsigned no_lock:1; /** Is access not implemented by fs? */ - unsigned no_access : 1; + unsigned no_access:1; /** Is create not implemented by fs? */ - unsigned no_create : 1; + unsigned no_create:1; /** Is interrupt not implemented by fs? */ - unsigned no_interrupt : 1; + unsigned no_interrupt:1; /** Is bmap not implemented by fs? */ - unsigned no_bmap : 1; + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; /** Do multi-page cached writes */ - unsigned big_writes : 1; + unsigned big_writes:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -445,6 +465,9 @@ struct fuse_conn { /** Version counter for attribute changes */ u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, */ int fuse_open_common(struct inode *inode, struct file *file, int isdir); -struct fuse_file *fuse_file_alloc(void); +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file, struct fuse_file *ff, struct fuse_open_out *outarg); @@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, int isdir); /** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** * Initialize file operations on a regular file */ void fuse_init_file_inode(struct inode *inode); @@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request (synchronous) */ -void request_send(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request with no reply */ -void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request in the background */ -void request_send_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** + * Initialize fuse_conn + */ +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); + +/** * Release reference to fuse_conn */ void fuse_conn_put(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2e99f34b4435..47c96fdca1ac 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -37,10 +37,10 @@ struct fuse_mount_data { unsigned rootmode; unsigned user_id; unsigned group_id; - unsigned fd_present : 1; - unsigned rootmode_present : 1; - unsigned user_id_present : 1; - unsigned group_id_present : 1; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; unsigned flags; unsigned max_read; unsigned blksize; @@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_forget_in); req->in.args[0].value = inarg; - request_send_noreply(fc, req); + fuse_request_send_noreply(fc, req); } static void fuse_clear_inode(struct inode *inode) @@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, fi = get_fuse_inode(inode); spin_lock(&fc->lock); - fi->nlookup ++; + fi->nlookup++; spin_unlock(&fc->lock); fuse_change_attributes(inode, attr, attr_valid, attr_version); @@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; req->force = 1; - request_send(fc, req); + fuse_request_send(fc, req); fuse_put_request(fc, req); } } @@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) req->out.args[0].size = fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; if (!err) convert_fuse_statfs(buf, &outarg.st); @@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static struct fuse_conn *new_conn(struct super_block *sb) +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) { - struct fuse_conn *fc; int err; - fc = kzalloc(sizeof(*fc), GFP_KERNEL); - if (fc) { - spin_lock_init(&fc->lock); - mutex_init(&fc->inst_mutex); - atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); - init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); - INIT_LIST_HEAD(&fc->bg_queue); - atomic_set(&fc->num_waiting, 0); - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - fc->bdi.unplug_io_fn = default_unplug_io_fn; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; - fc->dev = sb->s_dev; - err = bdi_init(&fc->bdi); - if (err) - goto error_kfree; - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - if (err) - goto error_bdi_destroy; - /* - * For a single fuse filesystem use max 1% of dirty + - * writeback threshold. - * - * This gives about 1M of write buffer for memory maps on a - * machine with 1G and 10% dirty_ratio, which should be more - * than enough. - * - * Privileged users can raise it by writing to - * - * /sys/class/bdi/<bdi>/max_ratio - */ - bdi_set_max_ratio(&fc->bdi, 1); - fc->reqctr = 0; - fc->blocked = 1; - fc->attr_version = 1; - get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + mutex_init(&fc->inst_mutex); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + atomic_set(&fc->num_waiting, 0); + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + fc->bdi.unplug_io_fn = default_unplug_io_fn; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->dev = sb->s_dev; + err = bdi_init(&fc->bdi); + if (err) + goto error_mutex_destroy; + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); } - return fc; + if (err) + goto error_bdi_destroy; + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi/<bdi>/max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + fc->reqctr = 0; + fc->blocked = 1; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); -error_bdi_destroy: + return 0; + + error_bdi_destroy: bdi_destroy(&fc->bdi); -error_kfree: + error_mutex_destroy: mutex_destroy(&fc->inst_mutex); - kfree(fc); - return NULL; + return err; } +EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { @@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc) fuse_request_free(fc->destroy_req); mutex_destroy(&fc->inst_mutex); bdi_destroy(&fc->bdi); - kfree(fc); + fc->release(fc); } } @@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) return fc; } -static struct inode *get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr, 0, 0); } -struct fuse_inode_handle -{ +struct fuse_inode_handle { u64 nodeid; u32 generation; }; @@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } - fuse_put_request(fc, req); fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } @@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) req->out.args[0].size = sizeof(struct fuse_init_out); req->out.args[0].value = &req->misc.init_out; req->end = process_init_reply; - request_send_background(fc, req); + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree(fc); } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - fc = new_conn(sb); + fc = kmalloc(sizeof(*fc), GFP_KERNEL); if (!fc) return -ENOMEM; + err = fuse_conn_init(fc, sb); + if (err) { + kfree(fc); + return err; + } + + fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; @@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = fc; err = -ENOMEM; - root = get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, d.rootmode); if (!root) goto err; @@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void) static void fuse_inode_init_once(void *foo) { - struct inode * inode = foo; + struct inode *inode = foo; inode_init_once(inode); } @@ -1031,7 +1042,7 @@ static int __init fuse_init(void) { int res; - printk("fuse init (API version %i.%i)\n", + printk(KERN_INFO "fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); INIT_LIST_HEAD(&fuse_conn_list); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index ab2f57e3fb87..e563a6449811 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) + depends on EXPERIMENTAL && (64BIT || LBD) select FS_POSIX_ACL select CRC32 help diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index ec65851ec80a..c1b4ec6a9650 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ +gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3e9bd46f27e3..e335dceb6a4f 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, struct gfs2_ea_location el_this; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return 0; memset(&er, 0, sizeof(struct gfs2_ea_request)); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index bec76b1c2bb0..11ffc56f1f81 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, void *kaddr = kmap(page); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, - PAGE_CACHE_SIZE - ip->i_di.di_size); + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, + PAGE_CACHE_SIZE - ip->i_disksize); kunmap(page); SetPageUptodate(page); @@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (error) goto out; - if (ip->i_di.di_size) { + if (ip->i_disksize) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ @@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - if (ip->i_di.di_size) { + if (ip->i_disksize) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) } } - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) error = gfs2_block_truncate_page(ip->i_inode.i_mapping); if (!error) { - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); } @@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip) if (error) goto out; - if (!ip->i_di.di_size) { + if (!ip->i_disksize) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) return -EINVAL; - if (size > ip->i_di.di_size) + if (size > ip->i_disksize) error = do_grow(ip, size); - else if (size < ip->i_di.di_size) + else if (size < ip->i_disksize) error = do_shrink(ip, size); else /* update time stamps */ @@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, ip->i_di.di_size); + error = trunc_dealloc(ip, ip->i_disksize); if (!error) error = trunc_end(ip); return error; @@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) } /** - * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file - * @ip: the file - * @len: the number of bytes to be written to the file - * @data_blocks: returns the number of data blocks required - * @ind_blocks: returns the number of indirect blocks required - * - */ - -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int tmp; - - if (gfs2_is_dir(ip)) { - *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; - *ind_blocks = 3 * (sdp->sd_max_jheight - 1); - } else { - *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; - *ind_blocks = 3 * (sdp->sd_max_height - 1); - } - - for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - *ind_blocks += tmp; - } -} - -/** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to @@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, struct buffer_head bh; unsigned int shift; u64 lblock, lblock_stop, size; + u64 end_of_file; *alloc_required = 0; @@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, *alloc_required = 1; shift = sdp->sd_sb.sb_bsize_shift; - if (gfs2_is_dir(ip)) { - unsigned int bsize = sdp->sd_jbsize; - lblock = offset; - do_div(lblock, bsize); - lblock_stop = offset + len + bsize - 1; - do_div(lblock_stop, bsize); - } else { - u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; - lblock = offset >> shift; - lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) - return 0; - } + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 0; size = (lblock_stop - lblock) << shift; do { diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 4e6cde2943bd..c983177e05ac 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,10 +10,40 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include "inode.h" + struct inode; struct gfs2_inode; struct page; + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); @@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi int gfs2_truncatei(struct gfs2_inode *ip, u64 size); int gfs2_truncatei_resume(struct gfs2_inode *ip); int gfs2_file_dealloc(struct gfs2_inode *ip); - -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, - unsigned int *ind_blocks); int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len, int *alloc_required); diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c deleted file mode 100644 index e51991947d2c..000000000000 --- a/fs/gfs2/daemon.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <linux/freezer.h> - -#include "gfs2.h" -#include "incore.h" -#include "daemon.h" -#include "glock.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "super.h" -#include "util.h" - -/* This uses schedule_timeout() instead of msleep() because it's good for - the daemons to wake up more often than the timeout when unmounting so - the user's unmount doesn't sit there forever. - - The kthread functions used to start these daemons block and flush signals. */ - -/** - * gfs2_glockd - Reclaim unused glock structures - * @sdp: Pointer to GFS2 superblock - * - * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. - * Number of daemons can be set by user, with num_glockd mount option. - */ - -int gfs2_glockd(void *data) -{ - struct gfs2_sbd *sdp = data; - - while (!kthread_should_stop()) { - while (atomic_read(&sdp->sd_reclaim_count)) - gfs2_reclaim_glock(sdp); - - wait_event_interruptible(sdp->sd_reclaim_wq, - (atomic_read(&sdp->sd_reclaim_count) || - kthread_should_stop())); - if (freezing(current)) - refrigerator(); - } - - return 0; -} - -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - -/** - * gfs2_quotad - Write cached quota changes into the quota file - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_quotad(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - int error; - - while (!kthread_should_stop()) { - /* Update the master statfs file */ - - t = sdp->sd_statfs_sync_time + - gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_statfs_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (1) error=%d\n", error); - sdp->sd_statfs_sync_time = jiffies; - } - - /* Update quota file */ - - t = sdp->sd_quota_sync_time + - gfs2_tune_get(sdp, gt_quota_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_quota_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (2) error=%d\n", error); - sdp->sd_quota_sync_time = jiffies; - } - - gfs2_quota_scan(sdp); - - t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h deleted file mode 100644 index 4be084fb6a62..000000000000 --- a/fs/gfs2/daemon.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __DAEMON_DOT_H__ -#define __DAEMON_DOT_H__ - -int gfs2_glockd(void *data); -int gfs2_recoverd(void *data); -int gfs2_quotad(void *data); - -#endif /* __DAEMON_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eed040d8ba3a..b7c8e5c70791 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -36,7 +36,7 @@ * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the * beginning of the leaf block. The dirents reside in leaves when * - * dip->i_di.di_flags & GFS2_DIF_EXHASH is true + * dip->i_diskflags & GFS2_DIF_EXHASH is true * * Otherwise, the dirents are "linear", within a single stuffed dinode block. * @@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, gfs2_trans_add_bh(ip->i_gl, dibh, 1); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); - if (ip->i_di.di_size < offset + size) - ip->i_di.di_size = offset + size; + if (ip->i_disksize < offset + size) + ip->i_disksize = offset + size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -226,8 +226,8 @@ out: if (error) return error; - if (ip->i_di.di_size < offset + copied) - ip->i_di.di_size = offset + copied; + if (ip->i_disksize < offset + copied) + ip->i_disksize = offset + copied; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, int copied = 0; int error = 0; - if (offset >= ip->i_di.di_size) + if (offset >= ip->i_disksize) return 0; - if (offset + size > ip->i_di.di_size) - size = ip->i_di.di_size - offset; + if (offset + size > ip->i_disksize) + size = ip->i_disksize - offset; if (!size) return 0; @@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, struct gfs2_inode *ip = GFS2_I(inode); int error; - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; - if (hsize * sizeof(u64) != ip->i_di.di_size) { + if (hsize * sizeof(u64) != ip->i_disksize) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); } @@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); - leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode) for (x = sdp->sd_hash_ptrs; x--; lp++) *lp = cpu_to_be64(bn); - dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; + dip->i_disksize = sdp->sd_sb.sb_bsize / 2; gfs2_add_inode_blocks(&dip->i_inode, 1); - dip->i_di.di_flags |= GFS2_DIF_EXHASH; + dip->i_diskflags |= GFS2_DIF_EXHASH; for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; dip->i_depth = y; @@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); - for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { + for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, block * sdp->sd_hash_bsize, sdp->sd_hash_bsize, 1); @@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, int copied = 0; int error; - if (!dip->i_di.di_entries) + if (!dip->i_entries) return 0; - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + if (dip->i_diskflags & GFS2_DIF_EXHASH) return dir_e_read(inode, offset, opaque, filldir); if (!gfs2_is_stuffed(dip)) { @@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = PTR_ERR(dent); goto out; } - if (dip->i_di.di_entries != g.offset) { + if (dip->i_entries != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir %llu, " - "ip->i_di.di_entries (%u) != g.offset (%u)\n", + "ip->i_entries (%u) != g.offset (%u)\n", (unsigned long long)dip->i_no_addr, - dip->i_di.di_entries, + dip->i_entries, g.offset); error = -EIO; goto out; } error = do_filldir_main(dip, offset, opaque, filldir, darr, - dip->i_di.di_entries, &copied); + dip->i_entries, &copied); out: kfree(darr); } @@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(type); - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); } @@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, if (error) break; gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_entries++; + ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; break; } - if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { error = dir_make_exhash(inode); if (error) break; @@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) } dirent_del(dip, bh, prev, dent); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); if (!entries) @@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) if (error) return error; - if (!dip->i_di.di_entries) + if (!dip->i_entries) gfs2_consist_inode(dip); gfs2_trans_add_bh(dip->i_gl, bh, 1); - dip->i_di.di_entries--; + dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(dip, bh->b_data); brelse(bh); @@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { brelse(bh); error = gfs2_meta_inode_buffer(dip, &bh); if (error) @@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 8a468cac9328..4f919440c3be 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -11,6 +11,7 @@ #define __DIR_DOT_H__ #include <linux/dcache.h> +#include <linux/crc32.h> struct inode; struct gfs2_inode; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index e3f76f451b0a..0d1c76d906ae 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { error = ea_foreach_i(ip, bh, ea_call, data); goto out; } @@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (error) return error; - if (ip->i_di.di_eattr) { + if (ip->i_eattr) { struct ea_list ei = { .ei_er = er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); @@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return -ENODATA; error = gfs2_ea_find(ip, er, &el); @@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - ip->i_di.di_eattr = bh->b_blocknr; + ip->i_eattr = bh->b_blocknr; error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); brelse(bh); @@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, int error; int mh_size = sizeof(struct gfs2_meta_header); - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, gfs2_buffer_clear_tail(indbh, mh_size); eablk = (__be64 *)(indbh->b_data + mh_size); - *eablk = cpu_to_be64(ip->i_di.di_eattr); - ip->i_di.di_eattr = blk; - ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; + *eablk = cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; gfs2_add_inode_blocks(&ip->i_inode, 1); eablk++; @@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); @@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) { + if (!ip->i_eattr) { if (er->er_flags & XATTR_REPLACE) return -ENODATA; return ea_init(ip, er); @@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) return error; if (el.el_ea) { - if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { brelse(el.el_bh); return -EPERM; } @@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return -ENODATA; error = gfs2_ea_find(ip, er, &el); @@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (bstart) gfs2_free_meta(ip, bstart, blen); - ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) struct buffer_head *dibh; int error; - rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); if (!rgd) { gfs2_consist_inode(ip); return -EIO; @@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_free_meta(ip, ip->i_di.di_eattr, 1); + gfs2_free_meta(ip, ip->i_eattr, 1); - ip->i_di.di_eattr = 0; + ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); error = gfs2_meta_inode_buffer(ip, &dibh); @@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) goto out_rindex; - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) goto out_rindex; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c962283d4e7f..6b983aef785d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -40,6 +40,7 @@ #include "quota.h" #include "super.h" #include "util.h" +#include "bmap.h" struct gfs2_gl_hash_bucket { struct hlist_head hb_list; @@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; -static struct task_struct *scand_process; -static unsigned int scand_secs = 5; static struct workqueue_struct *glock_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) @@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) } /** + * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + */ + +static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { + list_add_tail(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + } + spin_unlock(&lru_lock); +} + +/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl) if (atomic_dec_and_test(&gl->gl_ref)) { hlist_del(&gl->gl_list); write_unlock(gl_lock_addr(gl->gl_hash)); + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + } + spin_unlock(&lru_lock); GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; goto out; } write_unlock(gl_lock_addr(gl->gl_hash)); + /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ + if (atomic_read(&gl->gl_ref) == 2) + gfs2_glock_schedule_for_reclaim(gl); out: return rv; } @@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: true if there is a blocked holder at the head of the list + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. */ static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh, *tmp; @@ -310,6 +340,8 @@ restart: ret = glops->go_lock(gh); spin_lock(&gl->gl_spin); if (ret) { + if (ret == 1) + return 2; gh->gh_error = ret; list_del_init(&gh->gh_list); gfs2_holder_wake(gh); @@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; + int rv; spin_lock(&gl->gl_spin); state_change(gl, state); @@ -468,7 +501,6 @@ retry: gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { - int rv; spin_unlock(&gl->gl_spin); rv = glops->go_xmote_bh(gl, gh); if (rv == -EAGAIN) @@ -479,10 +511,13 @@ retry: goto out; } } - do_promote(gl); + rv = do_promote(gl); + if (rv == 2) + goto out_locked; } out: clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: spin_unlock(&gl->gl_spin); gfs2_glock_put(gl); } @@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) */ static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { struct gfs2_holder *gh = NULL; + int ret; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; @@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - if (do_promote(gl) == 0) + ret = do_promote(gl); + if (ret == 0) goto out; + if (ret == 2) + return; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) @@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl) */ static void handle_callback(struct gfs2_glock *gl, unsigned int state, - int remote, unsigned long delay) + unsigned long delay) { int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; @@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; - if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && - gl->gl_object) - gfs2_glock_schedule_for_reclaim(gl); } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; @@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) */ static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0); list_del_init(&gh->gh_list); if (find_first_holder(gl) == NULL) { @@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, delay = gl->gl_ops->go_min_hold_time; spin_lock(&gl->gl_spin); - handle_callback(gl, state, 1, delay); + handle_callback(gl, state, delay); spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } +static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + jd->jd_dirty = 1; + break; + } + spin_unlock(&sdp->sd_jindex_spin); +} + /** * gfs2_glock_cb - Callback used by locking module * @sdp: Pointer to the superblock @@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) * Returns: 1 if it's ok */ -static int demote_ok(struct gfs2_glock *gl) +static int demote_ok(const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; - int demote = 1; - - if (test_bit(GLF_STICKY, &gl->gl_flags)) - demote = 0; - else if (glops->go_demote_ok) - demote = glops->go_demote_ok(gl); - - return demote; -} - -/** - * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list - * @gl: the glock - * - */ - -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&gl->gl_reclaim)) { - gfs2_glock_hold(gl); - list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); - atomic_inc(&sdp->sd_reclaim_count); - spin_unlock(&sdp->sd_reclaim_lock); - wake_up(&sdp->sd_reclaim_wq); - } else - spin_unlock(&sdp->sd_reclaim_lock); + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; } -/** - * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list - * @sdp: the filesystem - * - * Called from gfs2_glockd() glock reclaim daemon, or when promoting a - * different glock and we notice that there are a lot of glocks in the - * reclaim list. - * - */ -void gfs2_reclaim_glock(struct gfs2_sbd *sdp) +static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; - int done_callback = 0; + int may_demote; + int nr_skipped = 0; + int got_ref = 0; + LIST_HEAD(skipped); - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&sdp->sd_reclaim_list)) { - spin_unlock(&sdp->sd_reclaim_lock); - return; - } - gl = list_entry(sdp->sd_reclaim_list.next, - struct gfs2_glock, gl_reclaim); - list_del_init(&gl->gl_reclaim); - spin_unlock(&sdp->sd_reclaim_lock); + if (nr == 0) + goto out; - atomic_dec(&sdp->sd_reclaim_count); - atomic_inc(&sdp->sd_reclaimed); + if (!(gfp_mask & __GFP_FS)) + return -1; - spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - done_callback = 1; + spin_lock(&lru_lock); + while(nr && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + + /* Test for being demotable */ + if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + gfs2_glock_hold(gl); + got_ref = 1; + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + may_demote = demote_ok(gl); + spin_unlock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + if (may_demote) { + handle_callback(gl, LM_ST_UNLOCKED, 0); + nr--; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + spin_lock(&lru_lock); + if (may_demote) + continue; + } + if (list_empty(&gl->gl_lru) && + (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { + nr_skipped++; + list_add(&gl->gl_lru, &skipped); + } + if (got_ref) { + spin_unlock(&lru_lock); + gfs2_glock_put(gl); + spin_lock(&lru_lock); + got_ref = 0; + } } - spin_unlock(&gl->gl_spin); - if (!done_callback || - queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + list_splice(&skipped, &lru_list); + atomic_add(nr_skipped, &lru_count); + spin_unlock(&lru_lock); +out: + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; } +static struct shrinker glock_shrinker = { + .shrink = gfs2_shrink_glock_memory, + .seeks = DEFAULT_SEEKS, +}; + /** * examine_bucket - Call a function for glock in a hash bucket * @examiner: the function @@ -1457,26 +1516,6 @@ out: } /** - * scan_glock - look at a glock and see if we can reclaim it - * @gl: the glock to look at - * - */ - -static void scan_glock(struct gfs2_glock *gl) -{ - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) - return; - if (test_bit(GLF_LOCK, &gl->gl_flags)) - return; - - spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); -} - -/** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * @@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl) static void clear_glock(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; - int released; - - spin_lock(&sdp->sd_reclaim_lock); - if (!list_empty(&gl->gl_reclaim)) { - list_del_init(&gl->gl_reclaim); - atomic_dec(&sdp->sd_reclaim_count); - spin_unlock(&sdp->sd_reclaim_lock); - released = gfs2_glock_put(gl); - gfs2_assert(sdp, !released); - } else { - spin_unlock(&sdp->sd_reclaim_lock); + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); } + spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) } } +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; + + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); +} + static const char *state2str(unsigned state) { switch(state) { @@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) char *p = buf; if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; - if (test_bit(GLF_STICKY, gflags)) - *p++ = 's'; if (test_bit(GLF_DEMOTE, gflags)) *p++ = 'D'; if (test_bit(GLF_PENDING_DEMOTE, gflags)) @@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) return error; } -/** - * gfs2_scand - Look for cached glocks and inodes to toss from memory - * @sdp: Pointer to GFS2 superblock - * - * One of these daemons runs, finding candidates to add to sd_reclaim_list. - * See gfs2_glockd() - */ - -static int gfs2_scand(void *data) -{ - unsigned x; - unsigned delay; - - while (!kthread_should_stop()) { - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(scan_glock, NULL, x); - if (freezing(current)) - refrigerator(); - delay = scand_secs; - if (delay < 1) - delay = 1; - schedule_timeout_interruptible(delay * HZ); - } - - return 0; -} - - int __init gfs2_glock_init(void) { @@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void) } #endif - scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand"); - if (IS_ERR(scand_process)) - return PTR_ERR(scand_process); - glock_workqueue = create_workqueue("glock_workqueue"); - if (IS_ERR(glock_workqueue)) { - kthread_stop(scand_process); + if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - } + + register_shrinker(&glock_shrinker); return 0; } void gfs2_glock_exit(void) { + unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); - kthread_stop(scand_process); } -module_param(scand_secs, uint, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); - static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 695c6b193611..543ec7ecfbda 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +void gfs2_glock_finish_truncate(struct gfs2_inode *ip); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c6c318c2a0f6..8522d3aa64fc 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) * Returns: 1 if it's ok */ -static int inode_go_demote_ok(struct gfs2_glock *gl) +static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - int demote = 0; - - if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) - demote = 1; - else if (!sdp->sd_args.ar_localcaching && - time_after_eq(jiffies, gl->gl_stamp + - gfs2_tune_get(sdp, gt_demote_secs) * HZ)) - demote = 1; - - return demote; + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + return 1; } /** @@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl) static int inode_go_lock(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; @@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } - if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_state == LM_ST_EXCLUSIVE)) - error = gfs2_truncatei_resume(ip); + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } return error; } @@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) return 0; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, - IF2DT(ip->i_inode.i_mode), ip->i_flags); + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)ip->i_inode.i_size, + (unsigned long long)ip->i_disksize); return 0; } @@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) * Returns: 1 if it's ok */ -static int rgrp_go_demote_ok(struct gfs2_glock *gl) +static int rgrp_go_demote_ok(const struct gfs2_glock *gl) { return !gl->gl_aspace->i_mapping->nrpages; } @@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_rgrpd *rgd = gl->gl_object; if (rgd == NULL) return 0; - gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); return 0; } @@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) } /** + * trans_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int trans_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock * @gl: the glock * * Returns: 1 if it's ok */ -static int quota_go_demote_ok(struct gfs2_glock *gl) +static int quota_go_demote_ok(const struct gfs2_glock *gl) { return !atomic_read(&gl->gl_lvb_count); } @@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { const struct gfs2_glock_operations gfs2_trans_glops = { .go_xmote_th = trans_go_sync, .go_xmote_bh = trans_go_xmote_bh, + .go_demote_ok = trans_go_demote_ok, .go_type = LM_TYPE_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f566ec1b4e8e..608849d00021 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -68,12 +68,6 @@ struct gfs2_bitmap { u32 bi_len; }; -struct gfs2_rgrp_host { - u32 rg_free; - u32 rg_dinodes; - u64 rg_igeneration; -}; - struct gfs2_rgrpd { struct list_head rd_list; /* Link with superblock */ struct list_head rd_list_mru; @@ -83,14 +77,16 @@ struct gfs2_rgrpd { u32 rd_length; /* length of rgrp header in fs blocks */ u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ - struct gfs2_rgrp_host rd_rg; + u32 rd_free; + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - unsigned int rd_bh_count; struct mutex rd_mutex; - u32 rd_free_clone; struct gfs2_log_element rd_le; - u32 rd_last_alloc; struct gfs2_sbd *rd_sbd; + unsigned int rd_bh_count; + u32 rd_last_alloc; unsigned char rd_flags; #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ @@ -129,7 +125,7 @@ struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); - int (*go_demote_ok) (struct gfs2_glock *gl); + int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); @@ -159,7 +155,6 @@ struct gfs2_holder { enum { GLF_LOCK = 1, - GLF_STICKY = 2, GLF_DEMOTE = 3, GLF_PENDING_DEMOTE = 4, GLF_DEMOTE_IN_PROGRESS = 5, @@ -194,7 +189,7 @@ struct gfs2_glock { unsigned long gl_tchange; void *gl_object; - struct list_head gl_reclaim; + struct list_head gl_lru; struct gfs2_sbd *gl_sbd; @@ -233,29 +228,24 @@ enum { GIF_USER = 4, /* user inode, not metadata addr space */ }; -struct gfs2_dinode_host { - u64 di_size; /* number of bytes in file */ - u64 di_generation; /* generation number for NFS */ - u32 di_flags; /* GFS2_DIF_... */ - /* These only apply to directories */ - u32 di_entries; /* The number of entries in the directory */ - u64 di_eattr; /* extended attribute block number */ -}; struct gfs2_inode { struct inode i_inode; u64 i_no_addr; u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + loff_t i_disksize; unsigned long i_flags; /* GIF_... */ - - struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ - struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_trunc_list; + u32 i_entries; + u32 i_diskflags; u8 i_height; u8 i_depth; }; @@ -406,13 +396,11 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_demote_secs; /* Cache retention for unheld glock */ unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; - unsigned int gt_quotad_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ @@ -488,10 +476,6 @@ struct gfs2_sbd { /* Lock Stuff */ struct lm_lockstruct sd_lockstruct; - struct list_head sd_reclaim_list; - spinlock_t sd_reclaim_lock; - wait_queue_head_t sd_reclaim_wq; - atomic_t sd_reclaim_count; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_trans_gl; @@ -519,7 +503,6 @@ struct gfs2_sbd { spinlock_t sd_statfs_spin; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; - unsigned long sd_statfs_sync_time; /* Resource group stuff */ @@ -552,8 +535,6 @@ struct gfs2_sbd { struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; - struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; - unsigned int sd_glockd_num; /* Quota stuff */ @@ -561,13 +542,15 @@ struct gfs2_sbd { atomic_t sd_quota_count; spinlock_t sd_quota_spin; struct mutex sd_quota_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; unsigned int sd_quota_chunks; unsigned char **sd_quota_bitmap; u64 sd_quota_sync_gen; - unsigned long sd_quota_sync_time; /* Log stuff */ @@ -624,10 +607,6 @@ struct gfs2_sbd { struct mutex sd_freeze_lock; unsigned int sd_freeze_count; - /* Counters */ - - atomic_t sd_reclaimed; - char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d57616840e89..3b87c188da41 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -32,7 +32,6 @@ #include "log.h" #include "meta_io.h" #include "ops_address.h" -#include "ops_inode.h" #include "quota.h" #include "rgrp.h" #include "trans.h" @@ -248,7 +247,6 @@ fail: static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { - struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; struct timespec atime; u16 height, depth; @@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) * to do that. */ ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - di->di_size = be64_to_cpu(str->di_size); - i_size_write(&ip->i_inode, di->di_size); + ip->i_disksize = be64_to_cpu(str->di_size); + i_size_write(&ip->i_inode, ip->i_disksize); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); @@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); ip->i_goal = be64_to_cpu(str->di_goal_meta); - di->di_generation = be64_to_cpu(str->di_generation); + ip->i_generation = be64_to_cpu(str->di_generation); - di->di_flags = be32_to_cpu(str->di_flags); + ip->i_diskflags = be32_to_cpu(str->di_flags); gfs2_set_inode_flags(&ip->i_inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) @@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) goto corrupt; ip->i_depth = (u8)depth; - di->di_entries = be32_to_cpu(str->di_entries); + ip->i_entries = be32_to_cpu(str->di_entries); - di->di_eattr = be64_to_cpu(str->di_eattr); + ip->i_eattr = be64_to_cpu(str->di_eattr); if (S_ISREG(ip->i_inode.i_mode)) gfs2_set_aops(&ip->i_inode); @@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) gfs2_free_di(rgd, ip); gfs2_trans_end(sdp); - clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); out_rg_gunlock: gfs2_glock_dq_uninit(&al->al_rgd_gh); @@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return error; } - if (dip->i_di.di_entries == (u32)-1) + if (dip->i_entries == (u32)-1) return -EFBIG; if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) return -EMLINK; @@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_flags = 0; if (S_ISREG(mode)) { - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); } else if (S_ISDIR(mode)) { - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + di->di_flags |= cpu_to_be32(dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); } @@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct qstr dotname; int error; - if (ip->i_di.di_entries != 2) { + if (ip->i_entries != 2) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); return -EIO; @@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) return error; } - if (!ip->i_di.di_size) { + if (!ip->i_disksize) { gfs2_consist_inode(ip); error = -EIO; goto out; @@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) if (error) goto out; - x = ip->i_di.di_size + 1; + x = ip->i_disksize + 1; if (x > *len) { *buf = kmalloc(x, GFP_NOFS); if (!*buf) { @@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { - const struct gfs2_dinode_host *di = &ip->i_di; struct gfs2_dinode *str = buf; str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_uid = cpu_to_be32(ip->i_inode.i_uid); str->di_gid = cpu_to_be32(ip->i_inode.i_gid); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(di->di_size); + str->di_size = cpu_to_be64(ip->i_disksize); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); @@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); - str->di_generation = cpu_to_be64(di->di_generation); + str->di_generation = cpu_to_be64(ip->i_generation); - str->di_flags = cpu_to_be32(di->di_flags); + str->di_flags = cpu_to_be32(ip->i_diskflags); str->di_height = cpu_to_be16(ip->i_height); str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && - !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? GFS2_FORMAT_DE : 0); str->di_depth = cpu_to_be16(ip->i_depth); - str->di_entries = cpu_to_be32(di->di_entries); + str->di_entries = cpu_to_be32(ip->i_entries); - str->di_eattr = cpu_to_be64(di->di_eattr); + str->di_eattr = cpu_to_be64(ip->i_eattr); str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); @@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) void gfs2_dinode_print(const struct gfs2_inode *ip) { - const struct gfs2_dinode_host *di = &ip->i_di; - printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)ip->i_no_formal_ino); printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); + printk(KERN_INFO " i_disksize = %llu\n", + (unsigned long long)ip->i_disksize); printk(KERN_INFO " blocks = %llu\n", (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); printk(KERN_INFO " i_goal = %llu\n", (unsigned long long)ip->i_goal); - printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); + printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags); printk(KERN_INFO " i_height = %u\n", ip->i_height); printk(KERN_INFO " i_depth = %u\n", ip->i_depth); - printk(KERN_INFO " di_entries = %u\n", di->di_entries); - printk(KERN_INFO " di_eattr = %llu\n", - (unsigned long long)di->di_eattr); + printk(KERN_INFO " i_entries = %u\n", ip->i_entries); + printk(KERN_INFO " i_eattr = %llu\n", + (unsigned long long)ip->i_eattr); } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2d43f69610a0..d5329364cdff 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -10,6 +10,7 @@ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ +#include <linux/fs.h> #include "util.h" static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) @@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) static inline int gfs2_is_jdata(const struct gfs2_inode *ip) { - return ip->i_di.di_flags & GFS2_DIF_JDATA; + return ip->i_diskflags & GFS2_DIF_JDATA; } static inline int gfs2_is_writeback(const struct gfs2_inode *ip) @@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); void gfs2_dinode_print(const struct gfs2_inode *ip); +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0c4cbe6c8285..1aa7eb6a0226 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -194,17 +194,25 @@ out: static void gdlm_recovery_done(void *lockspace, unsigned int jid, unsigned int message) { + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; struct gdlm_ls *ls = lockspace; ls->recover_jid_done = jid; ls->recover_jid_status = message; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); } static void gdlm_others_may_mount(void *lockspace) { + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; struct gdlm_ls *ls = lockspace; ls->first_done = 1; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); + kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); } /* Userspace gets the offline uevent, blocks new gfs locks on diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 4ec571c3d8a9..9b7edcf7bd49 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls) kobject_put(&ls->kobj); } +static int gdlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname); + add_uevent_var(env, "LOCKPROTO=lock_dlm"); + return 0; +} + +static struct kset_uevent_ops gdlm_uevent_ops = { + .uevent = gdlm_uevent, +}; + + int gdlm_sysfs_init(void) { - gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); + gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj); if (!gdlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bb2cc303ac29..7cacfde32194 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -19,7 +19,7 @@ #include "gfs2.h" #include "incore.h" -#include "ops_fstype.h" +#include "super.h" #include "sys.h" #include "util.h" #include "glock.h" @@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; } @@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo) INIT_LIST_HEAD(&gl->gl_holders); gl->gl_lvb = NULL; atomic_set(&gl->gl_lvb_count, 0); - INIT_LIST_HEAD(&gl->gl_reclaim); + INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); } @@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_rgrpd_cachep) goto fail; + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + error = register_filesystem(&gfs2_fs_type); if (error) goto fail; @@ -112,6 +119,9 @@ fail_unregister: fail: gfs2_glock_exit(); + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + if (gfs2_rgrpd_cachep) kmem_cache_destroy(gfs2_rgrpd_cachep); @@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); kmem_cache_destroy(gfs2_inode_cachep); diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f96eb90a2cfa..3cb0a44ba023 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -32,7 +32,6 @@ enum { Opt_debug, Opt_nodebug, Opt_upgrade, - Opt_num_glockd, Opt_acl, Opt_noacl, Opt_quota_off, @@ -57,7 +56,6 @@ static const match_table_t tokens = { {Opt_debug, "debug"}, {Opt_nodebug, "nodebug"}, {Opt_upgrade, "upgrade"}, - {Opt_num_glockd, "num_glockd=%d"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_quota_off, "quota=off"}, @@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) int error = 0; if (!remount) { - /* If someone preloaded options, use those instead */ - spin_lock(&gfs2_sys_margs_lock); - if (gfs2_sys_margs) { - data = gfs2_sys_margs; - gfs2_sys_margs = NULL; - } - spin_unlock(&gfs2_sys_margs_lock); - /* Set some defaults */ - args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; args->ar_quota = GFS2_QUOTA_DEFAULT; args->ar_data = GFS2_DATA_DEFAULT; } @@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) process them */ for (options = data; (o = strsep(&options, ",")); ) { - int token, option; + int token; substring_t tmp[MAX_OPT_ARGS]; if (!*o) @@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) goto cant_remount; args->ar_upgrade = 1; break; - case Opt_num_glockd: - if ((error = match_int(&tmp[0], &option))) { - fs_info(sdp, "problem getting num_glockd\n"); - goto out_error; - } - - if (remount && option != args->ar_num_glockd) - goto cant_remount; - if (!option || option > GFS2_GLOCKD_MAX) { - fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", - GFS2_GLOCKD_MAX, option); - error = -EINVAL; - goto out_error; - } - args->ar_num_glockd = option; - break; case Opt_acl: args->ar_posix_acl = 1; sdp->sd_vfs->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 27563816e1c5..4ddab67867eb 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc { struct inode *inode = page->mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int ret; int done_trans = 0; - error = gfs2_writepage_common(page, wbc); - if (error <= 0) - return error; - if (PageChecked(page)) { if (wbc->sync_mode != WB_SYNC_ALL) goto out_ignore; - error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (error) + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) goto out_ignore; done_trans = 1; } - error = __gfs2_jdata_writepage(page, wbc); + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); if (done_trans) gfs2_trans_end(sdp); - return error; + return ret; out_ignore: redirty_page_for_writepage(wbc, page); @@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); brelse(dibh); @@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - unsigned int data_blocks, ind_blocks, rblocks; + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; struct gfs2_alloc *al; @@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (unlikely(error)) goto out_uninit; - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) goto out_unlock; + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + if (alloc_required) { al = gfs2_alloc_get(ip); if (!al) { @@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, goto out_trans_fail; error = -ENOMEM; - page = __grab_cache_page(mapping, index); + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); *pagep = page; if (unlikely(!page)) goto out_endtrans; @@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (inode->i_size < to) { i_size_write(inode, to); - ip->i_di.di_size = inode->i_size; + ip->i_disksize = inode->i_size; di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } @@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { + if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { di = (struct gfs2_dinode *)dibh->b_data; - ip->i_di.di_size = inode->i_size; + ip->i_disksize = inode->i_size; di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index 4a5e676b4420..c2ad36330ca3 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -19,7 +19,7 @@ #include "incore.h" #include "dir.h" #include "glock.h" -#include "ops_dentry.h" +#include "super.h" #include "util.h" #include "inode.h" diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h deleted file mode 100644 index 5caa3db4d3f5..000000000000 --- a/fs/gfs2/ops_dentry.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_DENTRY_DOT_H__ -#define __OPS_DENTRY_DOT_H__ - -#include <linux/dcache.h> - -extern struct dentry_operations gfs2_dops; - -#endif /* __OPS_DENTRY_DOT_H__ */ diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index bbb8c36403a9..7fdeb14ddd1a 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -22,8 +22,7 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "ops_dentry.h" -#include "ops_fstype.h" +#include "super.h" #include "rgrp.h" #include "util.h" @@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, } error = -EIO; - if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { iput(inode); goto fail; } diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3a747f8e2188..93fe41b67f97 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -39,7 +39,6 @@ #include "util.h" #include "eaops.h" #include "ops_address.h" -#include "ops_inode.h" /** * gfs2_llseek - seek to a location in a file @@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) if (error) return error; - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); - if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) fsflags |= FS_JOURNAL_DATA_FL; if (put_user(fsflags, ptr)) error = -EFAULT; @@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) void gfs2_set_inode_flags(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_dinode_host *di = &ip->i_di; unsigned int flags = inode->i_flags; flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (di->di_flags & GFS2_DIF_IMMUTABLE) + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; - if (di->di_flags & GFS2_DIF_APPENDONLY) + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) flags |= S_APPEND; - if (di->di_flags & GFS2_DIF_NOATIME) + if (ip->i_diskflags & GFS2_DIF_NOATIME) flags |= S_NOATIME; - if (di->di_flags & GFS2_DIF_SYNC) + if (ip->i_diskflags & GFS2_DIF_SYNC) flags |= S_SYNC; inode->i_flags = flags; } @@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_drop_write; - flags = ip->i_di.di_flags; + flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) goto out; @@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_trans_end; gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_flags = new_flags; + ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); gfs2_set_inode_flags(inode); @@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned long last_index; - u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); + u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; int alloc_required = 0; struct gfs2_holder gh; @@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out; set_bit(GIF_SW_PAGED, &ip->i_flags); - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); if (ret || !alloc_required) goto out_unlock; @@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); al->al_requested = data_blocks + ind_blocks; ret = gfs2_inplace_reserve(ip); if (ret) @@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail; if (!(file->f_flags & O_LARGEFILE) && - ip->i_di.di_size > MAX_NON_LFS) { + ip->i_disksize > MAX_NON_LFS) { error = -EOVERFLOW; goto fail_gunlock; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b117fcf2c4f5..f91eebdde581 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -22,20 +22,18 @@ #include "gfs2.h" #include "incore.h" #include "bmap.h" -#include "daemon.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "mount.h" -#include "ops_fstype.h" -#include "ops_dentry.h" -#include "ops_super.h" #include "recovery.h" #include "rgrp.h" #include "super.h" #include "sys.h" #include "util.h" #include "log.h" +#include "quota.h" +#include "dir.h" #define DO 0 #define UNDO 1 @@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_demote_secs = 300; gt->gt_incore_log_blocks = 1024; gt->gt_log_flush_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; - gt->gt_quotad_secs = 5; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; @@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); - INIT_LIST_HEAD(&sdp->sd_reclaim_list); - spin_lock_init(&sdp->sd_reclaim_lock); - init_waitqueue_head(&sdp->sd_reclaim_wq); - mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); @@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_quota_list); spin_lock_init(&sdp->sd_quota_spin); mutex_init(&sdp->sd_quota_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_log_lock); @@ -443,24 +438,11 @@ out: static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, int undo) { - struct task_struct *p; int error = 0; if (undo) goto fail_trans; - for (sdp->sd_glockd_num = 0; - sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; - sdp->sd_glockd_num++) { - p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start glockd thread: %d\n", error); - goto fail; - } - sdp->sd_glockd_process[sdp->sd_glockd_num] = p; - } - error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, @@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, fs_err(sdp, "can't create transaction glock: %d\n", error); goto fail_rename; } - set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); return 0; @@ -506,9 +487,6 @@ fail_live: fail_mount: gfs2_glock_dq_uninit(mount_gh); fail: - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); - return error; } @@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) prev_db = 0; - for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { + for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = 1 << ip->i_inode.i_blkbits; @@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) sdp->sd_lockstruct.ls_lockspace); } +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; @@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) return PTR_ERR(sdp->sd_jindex); } ip = GFS2_I(sdp->sd_jindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); /* Load in the journal index special file */ @@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_statfs; } ip = GFS2_I(sdp->sd_rindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ @@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) } sdp->sd_logd_process = p; - sdp->sd_statfs_sync_time = jiffies; - sdp->sd_quota_sync_time = jiffies; - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); error = IS_ERR(p); if (error) { @@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, static void gfs2_kill_sb(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - if (sdp) { - gfs2_meta_syncfs(sdp); - dput(sdp->sd_root_dir); - dput(sdp->sd_master_dir); - sdp->sd_root_dir = NULL; - sdp->sd_master_dir = NULL; + + if (sdp == NULL) { + kill_block_super(sb); + return; } + + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); kill_block_super(sb); - if (sdp) - gfs2_delete_debugfs_file(sdp); + gfs2_delete_debugfs_file(sdp); + kfree(sdp); } struct file_system_type gfs2_fs_type = { diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h deleted file mode 100644 index da8490511836..000000000000 --- a/fs/gfs2/ops_fstype.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_FSTYPE_DOT_H__ -#define __OPS_FSTYPE_DOT_H__ - -#include <linux/fs.h> - -extern struct file_system_type gfs2_fs_type; -extern struct file_system_type gfs2meta_fs_type; -extern const struct export_operations gfs2_export_ops; - -#endif /* __OPS_FSTYPE_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d232991b9046..49877546beb9 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -19,6 +19,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/fiemap.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -31,12 +32,11 @@ #include "glock.h" #include "inode.h" #include "meta_io.h" -#include "ops_dentry.h" -#include "ops_inode.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" +#include "super.h" /** * gfs2_create - Create a file @@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (!dip->i_inode.i_nlink) goto out_gunlock; error = -EFBIG; - if (dip->i_di.di_entries == (u32)-1) + if (dip->i_entries == (u32)-1) goto out_gunlock; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; - ip->i_di.di_size = size; + ip->i_disksize = size; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) ip = ghs[1].gh_gl->gl_object; ip->i_inode.i_nlink = 2; - ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); - ip->i_di.di_flags |= GFS2_DIF_JDATA; - ip->i_di.di_entries = 2; + ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out_gunlock; - if (ip->i_di.di_entries < 2) { + if (ip->i_entries < 2) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); error = -EIO; goto out_gunlock; } - if (ip->i_di.di_entries > 2) { + if (ip->i_entries > 2) { error = -ENOTEMPTY; goto out_gunlock; } @@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(nip->i_inode.i_mode)) { - if (nip->i_di.di_entries < 2) { + if (nip->i_entries < 2) { if (gfs2_consist_inode(nip)) gfs2_dinode_print(nip); error = -EIO; goto out_gunlock; } - if (nip->i_di.di_entries > 2) { + if (nip->i_entries > 2) { error = -ENOTEMPTY; goto out_gunlock; } @@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -EINVAL; goto out_gunlock; } - if (ndip->i_di.di_entries == (u32)-1) { + if (ndip->i_entries == (u32)-1) { error = -EFBIG; goto out_gunlock; } @@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr) struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - if (attr->ia_size != ip->i_di.di_size) { + if (attr->ia_size != ip->i_disksize) { error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) return error; @@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr) } error = gfs2_truncatei(ip, attr->ia_size); - if (error && (inode->i_size != ip->i_di.di_size)) - i_size_write(inode, ip->i_di.di_size); + if (error && (inode->i_size != ip->i_disksize)) + i_size_write(inode, ip->i_disksize); return error; } @@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); } +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, @@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; const struct inode_operations gfs2_dir_iops = { @@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; const struct inode_operations gfs2_symlink_iops = { @@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h deleted file mode 100644 index 14b4b797622a..000000000000 --- a/fs/gfs2/ops_inode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_INODE_DOT_H__ -#define __OPS_INODE_DOT_H__ - -#include <linux/fs.h> - -extern const struct inode_operations gfs2_file_iops; -extern const struct inode_operations gfs2_dir_iops; -extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; -extern const struct file_operations gfs2_file_fops_nolock; -extern const struct file_operations gfs2_dir_fops_nolock; - -extern void gfs2_set_inode_flags(struct inode *inode); - -#endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index d5355d9b5926..320323d03479 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -28,7 +28,6 @@ #include "inode.h" #include "log.h" #include "mount.h" -#include "ops_super.h" #include "quota.h" #include "recovery.h" #include "rgrp.h" @@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb) kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); kthread_stop(sdp->sd_recoverd_process); - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); @@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb) /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); - kfree(sdp); } /** @@ -215,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) } /** - * gfs2_write_super_lockfs - prevent further writes to the filesystem + * gfs2_freeze - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_write_super_lockfs(struct super_block *sb) +static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return; + return -EINVAL; for (;;) { error = gfs2_freeze_fs(sdp); @@ -246,17 +242,150 @@ static void gfs2_write_super_lockfs(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } + return 0; } /** - * gfs2_unlockfs - reallow writes to the filesystem + * gfs2_unfreeze - reallow writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static void gfs2_unlockfs(struct super_block *sb) +static int gfs2_unfreeze(struct super_block *sb) { gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; } /** @@ -370,7 +499,6 @@ static void gfs2_clear_inode(struct inode *inode) */ if (test_bit(GIF_USER, &ip->i_flags)) { ip->i_gl->gl_object = NULL; - gfs2_glock_schedule_for_reclaim(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { @@ -423,8 +551,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",debug"); if (args->ar_upgrade) seq_printf(s, ",upgrade"); - if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) - seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); if (args->ar_posix_acl) seq_printf(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { @@ -494,16 +620,16 @@ static void gfs2_delete_inode(struct inode *inode) gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); if (error) - goto out_uninit; + goto out_truncate; if (S_ISDIR(inode->i_mode) && - (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + (ip->i_diskflags & GFS2_DIF_EXHASH)) { error = gfs2_dir_exhash_dealloc(ip); if (error) goto out_unlock; } - if (ip->i_di.di_eattr) { + if (ip->i_eattr) { error = gfs2_ea_dealloc(ip); if (error) goto out_unlock; @@ -519,6 +645,7 @@ static void gfs2_delete_inode(struct inode *inode) if (error) goto out_unlock; +out_truncate: error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; @@ -527,8 +654,8 @@ static void gfs2_delete_inode(struct inode *inode) gfs2_trans_end(sdp); out_unlock: - gfs2_glock_dq(&ip->i_iopen_gh); -out_uninit: + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED) @@ -563,8 +690,8 @@ const struct super_operations gfs2_super_ops = { .put_super = gfs2_put_super, .write_super = gfs2_write_super, .sync_fs = gfs2_sync_fs, - .write_super_lockfs = gfs2_write_super_lockfs, - .unlockfs = gfs2_unlockfs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, .statfs = gfs2_statfs, .remount_fs = gfs2_remount_fs, .clear_inode = gfs2_clear_inode, diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h deleted file mode 100644 index 442a274c6272..000000000000 --- a/fs/gfs2/ops_super.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_SUPER_DOT_H__ -#define __OPS_SUPER_DOT_H__ - -#include <linux/fs.h> - -extern const struct super_operations gfs2_super_ops; - -#endif /* __OPS_SUPER_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3e073f5144fa..b08d09696b3e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -46,6 +46,8 @@ #include <linux/bio.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "gfs2.h" #include "incore.h" @@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, struct gfs2_quota_data *qd; int error; - qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) return -ENOMEM; @@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, return 0; fail: - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); return error; } @@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, if (qd || !create) { if (new_qd) { gfs2_lvb_unhold(new_qd->qd_gl); - kfree(new_qd); + kmem_cache_free(gfs2_quotad_cachep, new_qd); } *qdp = qd; return 0; @@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) return; - if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) + if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; for (x = 0; x < al->al_qd_num; x++) { @@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); - unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; u64 dblock; u32 extlen = 0; int error; - if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || - ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { + if (!ip->i_disksize || ip->i_disksize > (64 << 20) || + ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { gfs2_consist_inode(ip); return -EIO; } @@ -1195,7 +1197,7 @@ fail: return error; } -void gfs2_quota_scan(struct gfs2_sbd *sdp) +static void gfs2_quota_scan(struct gfs2_sbd *sdp) { struct gfs2_quota_data *qd, *safe; LIST_HEAD(dead); @@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); } } @@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&sdp->sd_quota_spin); } @@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) } } +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) +{ + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} + +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct gfs2_sbd *sdp), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} + +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); + } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + gfs2_glock_finish_truncate(ip); + } +} + +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_tune *tune = &sdp->sd_tune; + unsigned long statfs_timeo = 0; + unsigned long quotad_timeo = 0; + unsigned long t = 0; + DEFINE_WAIT(wait); + int empty; + + while (!kthread_should_stop()) { + + /* Update the master statfs file */ + quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, + &statfs_timeo, &tune->gt_statfs_quantum); + + /* Update quota file */ + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + "ad_timeo, &tune->gt_quota_quantum); + + /* FIXME: This should be turned into a shrinker */ + gfs2_quota_scan(sdp); + + /* Check for & recover partially truncated inodes */ + quotad_check_trunc_list(sdp); + + if (freezing(current)) + refrigerator(); + t = min(quotad_timeo, statfs_timeo); + + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_lock(&sdp->sd_trunc_lock); + empty = list_empty(&sdp->sd_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + if (empty) + t -= schedule_timeout(t); + else + t = 0; + finish_wait(&sdp->sd_quota_wait, &wait); + } + + return 0; +} + diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 3b7f4b0e5dfe..cec9032be97d 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -15,22 +15,22 @@ struct gfs2_sbd; #define NO_QUOTA_CHANGE ((u32)-1) -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unhold(struct gfs2_inode *ip); +extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unlock(struct gfs2_inode *ip); +extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid); -int gfs2_quota_sync(struct gfs2_sbd *sdp); -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); +extern int gfs2_quota_sync(struct gfs2_sbd *sdp); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); -int gfs2_quota_init(struct gfs2_sbd *sdp); -void gfs2_quota_scan(struct gfs2_sbd *sdp); -void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index d5e91f4f6a0b..efd09c3d2b26 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,6 +14,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "gfs2.h" #include "incore.h" @@ -583,13 +585,35 @@ fail: return error; } +static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + int found = 0; + + spin_lock(&sdp->sd_jindex_spin); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_dirty) { + jd->jd_dirty = 0; + found = 1; + break; + } + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!found) + jd = NULL; + + return jd; +} + /** * gfs2_check_journals - Recover any dirty journals * @sdp: the filesystem * */ -void gfs2_check_journals(struct gfs2_sbd *sdp) +static void gfs2_check_journals(struct gfs2_sbd *sdp) { struct gfs2_jdesc *jd; @@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp) } } +/** + * gfs2_recoverd - Recover dead machine's journals + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_recoverd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_check_journals(sdp); + t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; + if (freezing(current)) + refrigerator(); + schedule_timeout_interruptible(t); + } + + return 0; +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index f7235e61c723..a8218ea15b57 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) *blk = 0; } -int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); -int gfs2_find_jhead(struct gfs2_jdesc *jd, +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -void gfs2_check_journals(struct gfs2_sbd *sdp); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); +extern int gfs2_recoverd(void *data); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2d90fb253505..8b01c635d925 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) bi->bi_len, x); } - if (count[0] != rgd->rd_rg.rg_free) { + if (count[0] != rgd->rd_free) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "free data mismatch: %u != %u\n", - count[0], rgd->rd_rg.rg_free); + count[0], rgd->rd_free); return; } - tmp = rgd->rd_data - - rgd->rd_rg.rg_free - - rgd->rd_rg.rg_dinodes; + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; if (count[1] + count[2] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", @@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) return; } - if (count[3] != rgd->rd_rg.rg_dinodes) { + if (count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_rg.rg_dinodes); + count[3], rgd->rd_dinodes); return; } @@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) + if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = ip->i_di.di_size; + u64 rgrp_count = ip->i_disksize; int error; if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { @@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { /* Ignore partials */ if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - ip->i_di.di_size) + ip->i_disksize) break; error = read_rindex_entry(ip, &ra_state); if (error) { @@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) { const struct gfs2_rgrp *str = buf; - struct gfs2_rgrp_host *rg = &rgd->rd_rg; u32 rg_flags; rg_flags = be32_to_cpu(str->rg_flags); @@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rgd->rd_flags |= GFS2_RDF_NOALLOC; else rgd->rd_flags &= ~GFS2_RDF_NOALLOC; - rg->rg_free = be32_to_cpu(str->rg_free); - rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); - rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); } static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - struct gfs2_rgrp_host *rg = &rgd->rd_rg; u32 rg_flags = 0; if (rgd->rd_flags & GFS2_RDF_NOALLOC) rg_flags |= GFS2_RGF_NOALLOC; str->rg_flags = cpu_to_be32(rg_flags); - str->rg_free = cpu_to_be32(rg->rg_free); - str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); - str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); } @@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_free_clone = rgd->rd_free; rgd->rd_bh_count++; spin_unlock(&sdp->sd_rindex_spin); @@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) } spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_free_clone = rgd->rd_free; spin_unlock(&sdp->sd_rindex_spin); } @@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) block = rgd->rd_data0 + blk; ip->i_goal = block; - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); - rgd->rd_rg.rg_free -= *n; + gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); + rgd->rd_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) block = rgd->rd_data0 + blk; - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - rgd->rd_rg.rg_dinodes++; - *generation = rgd->rd_rg.rg_igeneration++; + gfs2_assert_withdraw(sdp, rgd->rd_free); + rgd->rd_free--; + rgd->rd_dinodes++; + *generation = rgd->rd_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) if (!rgd) return; - rgd->rd_rg.rg_free += blen; + rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) if (!rgd) return; - rgd->rd_rg.rg_free += blen; + rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) return; gfs2_assert_withdraw(sdp, rgd == tmp_rgd); - if (!rgd->rd_rg.rg_dinodes) + if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); - rgd->rd_rg.rg_dinodes--; - rgd->rd_rg.rg_free++; + rgd->rd_dinodes--; + rgd->rd_free++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c3ba3d9d0aac..141b781f2fcc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -34,76 +34,6 @@ #include "util.h" /** - * gfs2_jindex_hold - Grab a lock on the jindex - * @sdp: The GFS2 superblock - * @ji_gh: the holder for the jindex glock - * - * This is very similar to the gfs2_rindex_hold() function, except that - * in general we hold the jindex lock for longer periods of time and - * we grab it far less frequently (in general) then the rgrp lock. - * - * Returns: errno - */ - -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) -{ - struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); - struct qstr name; - char buf[20]; - struct gfs2_jdesc *jd; - int error; - - name.name = buf; - - mutex_lock(&sdp->sd_jindex_mutex); - - for (;;) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); - if (error) - break; - - name.len = sprintf(buf, "journal%u", sdp->sd_journals); - name.hash = gfs2_disk_hash(name.name, name.len); - - error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); - if (error == -ENOENT) { - error = 0; - break; - } - - gfs2_glock_dq_uninit(ji_gh); - - if (error) - break; - - error = -ENOMEM; - jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); - if (!jd) - break; - - INIT_LIST_HEAD(&jd->extent_list); - jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); - if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { - if (!jd->jd_inode) - error = -ENOENT; - else - error = PTR_ERR(jd->jd_inode); - kfree(jd); - break; - } - - spin_lock(&sdp->sd_jindex_spin); - jd->jd_jid = sdp->sd_journals++; - list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); - spin_unlock(&sdp->sd_jindex_spin); - } - - mutex_unlock(&sdp->sd_jindex_mutex); - - return error; -} - -/** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock * @@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) return jd; } -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) -{ - struct gfs2_jdesc *jd; - - spin_lock(&sdp->sd_jindex_spin); - jd = jdesc_find_i(&sdp->sd_jindex_list, jid); - if (jd) - jd->jd_dirty = 1; - spin_unlock(&sdp->sd_jindex_spin); -} - -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - int found = 0; - - spin_lock(&sdp->sd_jindex_spin); - - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } - } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - - return jd; -} - int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) int ar; int error; - if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || - (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { + if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || + (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { gfs2_consist_inode(ip); return -EIO; } - jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); + error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); if (!error && ar) { gfs2_consist_inode(ip); error = -EIO; @@ -423,137 +320,6 @@ out: return error; } -/** - * gfs2_statfs_i - Do a statfs - * @sdp: the filesystem - * @sg: the sg structure - * - * Returns: errno - */ - -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - - spin_lock(&sdp->sd_statfs_spin); - - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; - - spin_unlock(&sdp->sd_statfs_spin); - - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; - - return 0; -} - -/** - * statfs_fill - fill in the sg for a given RG - * @rgd: the RG - * @sc: the sc structure - * - * Returns: 0 on success, -ESTALE if the LVB is invalid - */ - -static int statfs_slow_fill(struct gfs2_rgrpd *rgd, - struct gfs2_statfs_change_host *sc) -{ - gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_rg.rg_free; - sc->sc_dinodes += rgd->rd_rg.rg_dinodes; - return 0; -} - -/** - * gfs2_statfs_slow - Stat a filesystem using asynchronous locking - * @sdp: the filesystem - * @sc: the sc info that will be returned - * - * Any error (other than a signal) will cause this routine to fall back - * to the synchronous version. - * - * FIXME: This really shouldn't busy wait like this. - * - * Returns: errno - */ - -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_holder ri_gh; - struct gfs2_rgrpd *rgd_next; - struct gfs2_holder *gha, *gh; - unsigned int slots = 64; - unsigned int x; - int done; - int error = 0, err; - - memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); - if (!gha) - return -ENOMEM; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - - rgd_next = gfs2_rgrpd_get_first(sdp); - - for (;;) { - done = 1; - - for (x = 0; x < slots; x++) { - gh = gha + x; - - if (gh->gh_gl && gfs2_glock_poll(gh)) { - err = gfs2_glock_wait(gh); - if (err) { - gfs2_holder_uninit(gh); - error = err; - } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); - gfs2_glock_dq_uninit(gh); - } - } - - if (gh->gh_gl) - done = 0; - else if (rgd_next && !error) { - error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, - gh); - rgd_next = gfs2_rgrpd_get_next(rgd_next); - done = 0; - } - - if (signal_pending(current)) - error = -ERESTARTSYS; - } - - if (done) - break; - - yield(); - } - - gfs2_glock_dq_uninit(&ri_gh); - -out: - kfree(gha); - return error; -} - struct lfcc { struct list_head list; struct gfs2_holder gh; @@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_log_header_host lh; int error; - error = gfs2_jindex_hold(sdp, &ji_gh); - if (error) - return error; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); if (!lfcc) { diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 50a4c9b1215e..f6b8b00ad881 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -10,6 +10,8 @@ #ifndef __SUPER_DOT_H__ #define __SUPER_DOT_H__ +#include <linux/fs.h> +#include <linux/dcache.h> #include "incore.h" void gfs2_lm_unmount(struct gfs2_sbd *sdp); @@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); void gfs2_jindex_free(struct gfs2_sbd *sdp); struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); int gfs2_jdesc_check(struct gfs2_jdesc *jd); int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, @@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp); void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); int gfs2_statfs_sync(struct gfs2_sbd *sdp); -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); int gfs2_freeze_fs(struct gfs2_sbd *sdp); void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern struct dentry_operations gfs2_dops; + #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7e1879f1a02c..26c1fa777a95 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -26,9 +26,6 @@ #include "quota.h" #include "util.h" -char *gfs2_sys_margs; -spinlock_t gfs2_sys_margs_lock; - static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%u:%u\n", @@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n"); ARGS_ATTR(localflocks, "%d\n"); ARGS_ATTR(debug, "%d\n"); ARGS_ATTR(upgrade, "%d\n"); -ARGS_ATTR(num_glockd, "%u\n"); ARGS_ATTR(posix_acl, "%d\n"); ARGS_ATTR(quota, "%u\n"); ARGS_ATTR(suiddir, "%d\n"); @@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = { &args_attr_localflocks.attr, &args_attr_debug.attr, &args_attr_upgrade.attr, - &args_attr_num_glockd.attr, &args_attr_posix_acl.attr, &args_attr_quota.attr, &args_attr_suiddir.attr, @@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = { }; /* - * display counters from superblock - */ - -struct counters_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - -#define COUNTERS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, \ - (unsigned int)atomic_read(&sdp->sd_##name)); \ -} \ -static struct counters_attr counters_attr_##name = __ATTR_RO(name) - -COUNTERS_ATTR(reclaimed, "%u\n"); - -static struct attribute *counters_attrs[] = { - &counters_attr_reclaimed.attr, - NULL, -}; - -/* * get and set struct gfs2_tune fields */ @@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -TUNE_ATTR(demote_secs, 0); TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); @@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); TUNE_ATTR_DAEMON(logd_secs, logd_process); -TUNE_ATTR_DAEMON(quotad_secs, quotad_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_demote_secs.attr, &tune_attr_incore_log_blocks.attr, &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, @@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_statfs_quantum.attr, &tune_attr_recoverd_secs.attr, &tune_attr_logd_secs.attr, - &tune_attr_quotad_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, NULL, @@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = { .attrs = lockstruct_attrs, }; -static struct attribute_group counters_group = { - .name = "counters", - .attrs = counters_attrs, -}; - static struct attribute_group args_group = { .name = "args", .attrs = args_attrs, @@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &counters_group); - if (error) - goto fail_lockstruct; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); if (error) - goto fail_counters; + goto fail_lockstruct; error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) @@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); -fail_counters: - sysfs_remove_group(&sdp->sd_kobj, &counters_group); fail_lockstruct: sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); fail_reg: @@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); - sysfs_remove_group(&sdp->sd_kobj, &counters_group); sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); kobject_put(&sdp->sd_kobj); } +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + return 0; +} + +static struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + + int gfs2_sys_init(void) { - gfs2_sys_margs = NULL; - spin_lock_init(&gfs2_sys_margs_lock); - gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj); + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); if (!gfs2_kset) return -ENOMEM; return 0; @@ -515,7 +482,6 @@ int gfs2_sys_init(void) void gfs2_sys_uninit(void) { - kfree(gfs2_sys_margs); kset_unregister(gfs2_kset); } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index 1ca8cdac5304..e94560e836d7 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -13,10 +13,6 @@ #include <linux/spinlock.h> struct gfs2_sbd; -/* Allow args to be passed to GFS2 when using an initial ram disk */ -extern char *gfs2_sys_margs; -extern spinlock_t gfs2_sys_margs_lock; - int gfs2_sys_fs_add(struct gfs2_sbd *sdp); void gfs2_sys_fs_del(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index d31e355c61fb..374f50e95496 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly; struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 7f48576289c9..33e96b0ce9ab 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep; extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3a31451ac170..5c538e0ec14b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7d479ce3aceb..6903d37af037 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, for (;;) { struct page *page; unsigned long nr, ret; + int ra; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, */ ret = len < nr ? len : nr; if (clear_user(buf, ret)) - ret = -EFAULT; + ra = -EFAULT; + else + ra = 0; } else { /* * We have the page, copy it to user space buffer. */ - ret = hugetlbfs_read_actor(page, offset, buf, len, nr); + ra = hugetlbfs_read_actor(page, offset, buf, len, nr); + ret = ra; } - if (ret < 0) { + if (ra < 0) { if (retval == 0) - retval = ret; + retval = ra; if (page) page_cache_release(page); goto out; @@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; - inode->i_blocks = 0; inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/inode.c b/fs/inode.c index 0487ddba1397..913ab2d9a5d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/inotify.h> #include <linux/mount.h> +#include <linux/async.h> /* * This is needed for the following functions: @@ -108,84 +109,102 @@ static void wake_up_inode(struct inode *inode) wake_up_bit(&inode->i_state, __I_LOCK); } -static struct inode *alloc_inode(struct super_block *sb) +/** + * inode_init_always - perform inode structure intialisation + * @sb: superblock inode belongs to + * @inode: inode to initialise + * + * These are initializations that need to be done on every inode + * allocation as the fields are not initialised by slab allocation. + */ +struct inode *inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct inode *inode; - if (sb->s_op->alloc_inode) - inode = sb->s_op->alloc_inode(sb); - else - inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); - - if (inode) { - struct address_space * const mapping = &inode->i_data; - - inode->i_sb = sb; - inode->i_blkbits = sb->s_blocksize_bits; - inode->i_flags = 0; - atomic_set(&inode->i_count, 1); - inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; - inode->i_nlink = 1; - atomic_set(&inode->i_writecount, 0); - inode->i_size = 0; - inode->i_blocks = 0; - inode->i_bytes = 0; - inode->i_generation = 0; + struct address_space * const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; + inode->i_nlink = 1; + inode->i_uid = 0; + inode->i_gid = 0; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; #ifdef CONFIG_QUOTA - memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); #endif - inode->i_pipe = NULL; - inode->i_bdev = NULL; - inode->i_cdev = NULL; - inode->i_rdev = 0; - inode->dirtied_when = 0; - if (security_inode_alloc(inode)) { - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; - } + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; + inode->dirtied_when = 0; + if (security_inode_alloc(inode)) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, (inode)); + return NULL; + } - spin_lock_init(&inode->i_lock); - lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + spin_lock_init(&inode->i_lock); + lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); - mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + mutex_init(&inode->i_mutex); + lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + init_rwsem(&inode->i_alloc_sem); + lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); - mapping->a_ops = &empty_aops; - mapping->host = inode; - mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); - mapping->assoc_mapping = NULL; - mapping->backing_dev_info = &default_backing_dev_info; - mapping->writeback_index = 0; + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. - */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; + /* + * If the block_device provides a backing_dev_info for client + * inodes then use that. Otherwise the inode share the bdev's + * backing_dev_info. + */ + if (sb->s_bdev) { + struct backing_dev_info *bdi; - bdi = sb->s_bdev->bd_inode_backing_dev_info; - if (!bdi) - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } - inode->i_private = NULL; - inode->i_mapping = mapping; + bdi = sb->s_bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; + mapping->backing_dev_info = bdi; } + inode->i_private = NULL; + inode->i_mapping = mapping; + return inode; } +EXPORT_SYMBOL(inode_init_always); + +static struct inode *alloc_inode(struct super_block *sb) +{ + struct inode *inode; + + if (sb->s_op->alloc_inode) + inode = sb->s_op->alloc_inode(sb); + else + inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + + if (inode) + return inode_init_always(sb, inode); + return NULL; +} void destroy_inode(struct inode *inode) { @@ -196,6 +215,7 @@ void destroy_inode(struct inode *inode) else kmem_cache_free(inode_cachep, (inode)); } +EXPORT_SYMBOL(destroy_inode); /* @@ -534,12 +554,55 @@ repeat: return node ? inode : NULL; } +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); + return tmp & I_HASHMASK; +} + +static inline void +__inode_add_to_lists(struct super_block *sb, struct hlist_head *head, + struct inode *inode) +{ + inodes_stat.nr_inodes++; + list_add(&inode->i_list, &inode_in_use); + list_add(&inode->i_sb_list, &sb->s_inodes); + if (head) + hlist_add_head(&inode->i_hash, head); +} + +/** + * inode_add_to_lists - add a new inode to relevant lists + * @sb: superblock inode belongs to + * @inode: inode to mark in use + * + * When an inode is allocated it needs to be accounted for, added to the in use + * list, the owning superblock and the inode hash. This needs to be done under + * the inode_lock, so export a function to do this rather than the inode lock + * itself. We calculate the hash list to add to here so it is all internal + * which requires the caller to have already set up the inode number in the + * inode to add. + */ +void inode_add_to_lists(struct super_block *sb, struct inode *inode) +{ + struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); + + spin_lock(&inode_lock); + __inode_add_to_lists(sb, head, inode); + spin_unlock(&inode_lock); +} +EXPORT_SYMBOL_GPL(inode_add_to_lists); + /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask - * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the @@ -561,9 +624,7 @@ struct inode *new_inode(struct super_block *sb) inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); - list_add(&inode->i_sb_list, &sb->s_inodes); + __inode_add_to_lists(sb, NULL, inode); inode->i_ino = ++last_ino; inode->i_state = 0; spin_unlock(&inode_lock); @@ -622,10 +683,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h if (set(inode, data)) goto set_failed; - inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); - list_add(&inode->i_sb_list, &sb->s_inodes); - hlist_add_head(&inode->i_hash, head); + __inode_add_to_lists(sb, head, inode); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -671,10 +729,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); - list_add(&inode->i_sb_list, &sb->s_inodes); - hlist_add_head(&inode->i_hash, head); + __inode_add_to_lists(sb, head, inode); inode->i_state = I_LOCK|I_NEW; spin_unlock(&inode_lock); @@ -698,16 +753,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he return inode; } -static unsigned long hash(struct super_block *sb, unsigned long hashval) -{ - unsigned long tmp; - - tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / - L1_CACHE_BYTES; - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); - return tmp & I_HASHMASK; -} - /** * iunique - get a unique inode number * @sb: superblock @@ -990,6 +1035,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) EXPORT_SYMBOL(iget_locked); +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *old; + + inode->i_state |= I_LOCK|I_NEW; + while (1) { + spin_lock(&inode_lock); + old = find_inode_fast(sb, head, ino); + if (likely(!old)) { + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); + return 0; + } + __iget(old); + spin_unlock(&inode_lock); + wait_on_inode(old); + if (unlikely(!hlist_unhashed(&old->i_hash))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} + +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct super_block *sb = inode->i_sb; + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *old; + + inode->i_state |= I_LOCK|I_NEW; + + while (1) { + spin_lock(&inode_lock); + old = find_inode(sb, head, test, data); + if (likely(!old)) { + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode_lock); + return 0; + } + __iget(old); + spin_unlock(&inode_lock); + wait_on_inode(old); + if (unlikely(!hlist_unhashed(&old->i_hash))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} + +EXPORT_SYMBOL(insert_inode_locked4); + /** * __insert_inode_hash - hash an inode * @inode: unhashed inode @@ -1292,6 +1396,7 @@ int inode_wait(void *word) schedule(); return 0; } +EXPORT_SYMBOL(inode_wait); /* * If we try to find an inode in the inode hash while it is being diff --git a/fs/ioctl.c b/fs/ioctl.c index 43e8b2c0664b..20b0a8a24c6b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); -/* +/** + * __generic_block_fiemap - FIEMAP for block based inodes (no locking) * @inode - the inode to map * @arg - the pointer to userspace where we copy everything to * @get_block - the fs's get_block function @@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) * * If it is possible to have data blocks beyond a hole past @inode->i_size, then * please do not use this function, it will stop at the first unmapped block - * beyond i_size + * beyond i_size. + * + * If you use this function directly, you need to do your own locking. Use + * generic_block_fiemap if you want the locking done for you. */ -int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) + +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) { struct buffer_head tmp; unsigned int start_blk; @@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode, start_blk = logical_to_blk(inode, start); - /* guard against change */ - mutex_lock(&inode->i_mutex); - length = (long long)min_t(u64, len, i_size_read(inode)); map_len = length; @@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode, cond_resched(); } while (1); - mutex_unlock(&inode->i_mutex); - /* if ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; } +EXPORT_SYMBOL(__generic_block_fiemap); + +/** + * generic_block_fiemap - FIEMAP for block based inodes + * @inode: The inode to map + * @fieinfo: The mapping information + * @start: The initial block to map + * @len: The length of the extect to attempt to map + * @get_block: The block mapping function for the fs + * + * Calls __generic_block_fiemap to map the inode, after taking + * the inode's mutex lock. + */ + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + int ret; + mutex_lock(&inode->i_mutex); + ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); + mutex_unlock(&inode->i_mutex); + return ret; +} EXPORT_SYMBOL(generic_block_fiemap); #endif /* CONFIG_BLOCK */ @@ -415,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, return error; } +static int ioctl_fsfreeze(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If filesystem doesn't support freeze feature, return. */ + if (sb->s_op->freeze_fs == NULL) + return -EOPNOTSUPP; + + /* If a blockdevice-backed filesystem isn't specified, return. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Freeze */ + sb = freeze_bdev(sb->s_bdev); + if (IS_ERR(sb)) + return PTR_ERR(sb); + return 0; +} + +static int ioctl_fsthaw(struct file *filp) +{ + struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ + if (sb->s_bdev == NULL) + return -EINVAL; + + /* Thaw */ + return thaw_bdev(sb->s_bdev, sb); +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -462,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, } else error = -ENOTTY; break; + + case FIFREEZE: + error = ioctl_fsfreeze(filp); + break; + + case FITHAW: + error = ioctl_fsthaw(filp); + break; + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a2..1a39ac370942 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -27,7 +27,7 @@ #include <linux/security.h> #include <linux/pid_namespace.h> -static int set_task_ioprio(struct task_struct *task, int ioprio) +int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; @@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task_unlock(task); return err; } +EXPORT_SYMBOL_GPL(set_task_ioprio); asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3f8af0f1505b..6147ec3643a0 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -855,10 +855,6 @@ root_found: } sbi->s_joliet_level = joliet_level; - /* check the root inode */ - if (!inode->i_op) - goto out_bad_root; - /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { printk(KERN_WARNING @@ -886,8 +882,6 @@ root_found: /* * Display error messages and free resources. */ -out_bad_root: - printk(KERN_WARNING "%s: root inode not initialized\n", __func__); out_iput: iput(inode); goto out_no_inode; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c51..3fbffb1ea714 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -913,6 +916,18 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c8808..e6a117431277 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -25,6 +25,7 @@ #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -752,7 +754,6 @@ out: * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle) * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is usefull for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We acheive this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * * But don't do this if this process was the most recent one to * perform a synchronous write. We do this to detect the case where a * single process is doing a stream of sync writes. No point in waiting @@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle) */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9497718fe920..17159cacbd9e 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -249,16 +249,14 @@ restart: return ret; } -#define NR_BATCH 64 - static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +__flush_batch(journal_t *journal, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, bhs); + ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; + struct buffer_head *bh = journal->j_chkpt_bhs[i]; clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); @@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - transaction_t *transaction) + int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); - bhs[*batch_count] = bh; + journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; - if (*batch_count == NR_BATCH) { + if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); + __flush_batch(journal, batch_count); ret = 1; } } @@ -388,7 +385,6 @@ restart: if (journal->j_checkpoint_transactions == transaction && transaction->t_tid == this_tid) { int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; int retry = 0, err; @@ -402,7 +398,7 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, &batch_count, + retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) result = retry; @@ -419,7 +415,7 @@ restart: spin_unlock(&journal->j_list_lock); retry = 1; } - __flush_batch(journal, bhs, &batch_count); + __flush_batch(journal, &batch_count); } if (retry) { @@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) safely remove this transaction from the log */ __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... */ @@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_running_transaction != transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ebc667bc54a8..62804e57a44c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -25,6 +25,7 @@ #include <linux/crc32.h> #include <linux/writeback.h> #include <linux/backing-dev.h> +#include <linux/bio.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } *cbh = bh; return ret; @@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal, * This function along with journal_submit_commit_record * allows to write the commit record asynchronously. */ -static int journal_wait_on_commit_record(struct buffer_head *bh) +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) { int ret = 0; +retry: clear_buffer_dirty(bh); wait_on_buffer(bh); + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { + printk(KERN_WARNING + "JBD2: wait_on_commit_record: sync failed on %s - " + "disabling barriers\n", journal->j_devname); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_BARRIER; + spin_unlock(&journal->j_state_lock); + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + ret = submit_bh(WRITE_SYNC, bh); + if (ret) { + unlock_buffer(bh); + return ret; + } + goto retry; + } if (unlikely(!buffer_uptodate(bh))) ret = -EIO; @@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; - int i; + int i, to_free = 0; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? + jh->b_frozen_triggers : + jh->b_triggers); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been @@ -799,7 +829,7 @@ wait_for_iobuf: __jbd2_journal_abort_hard(journal); } if (!err && !is_journal_aborted(journal)) - err = journal_wait_on_commit_record(cbh); + err = journal_wait_on_commit_record(journal, cbh); if (err) jbd2_journal_abort(journal, err); @@ -844,6 +874,9 @@ restart_loop: * data. * * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. */ if (jh->b_committed_data) { jbd2_free(jh->b_committed_data, bh->b_size); @@ -851,10 +884,12 @@ restart_loop: if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; } } else if (jh->b_frozen_data) { jbd2_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; } spin_lock(&journal->j_list_lock); @@ -972,14 +1007,23 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; - spin_unlock(&journal->j_state_lock); + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); - if (journal->j_commit_callback) - journal->j_commit_callback(journal, commit_transaction); + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; @@ -998,11 +1042,16 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, journal->j_commit_sequence, + journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e70d657a19f8..56675306ed81 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -40,6 +40,7 @@ #include <asm/uaccess.h> #include <asm/page.h> +#include <asm/div64.h> EXPORT_SYMBOL(jbd2_journal_start); EXPORT_SYMBOL(jbd2_journal_restart); @@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); -EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_abort); @@ -131,8 +132,9 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); - printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", - journal->j_commit_interval / HZ); + printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " + "commit interval %ld seconds\n", current->pid, + journal->j_devname, journal->j_commit_interval / HZ); /* * And now, wait forever for commit wakeup events. @@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + struct jbd2_buffer_trigger_type *triggers; /* * The buffer really shouldn't be locked: only the current committing @@ -314,13 +317,23 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); + triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); + triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* + * Fire any commit trigger. Do this before checking for escaping, + * as the trigger may modify the magic offset. If a copy-out + * happens afterwards, it will have the correct data in the buffer. + */ + jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, + triggers); + + /* * Check for escaping */ if (*((__be32 *)(mapped_data + new_offset)) == @@ -352,6 +365,13 @@ repeat: new_page = virt_to_page(tmp); new_offset = offset_in_page(tmp); done_copy_out = 1; + + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. + */ + jh_in->b_frozen_triggers = jh_in->b_triggers; } /* @@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %luus average transaction commit time\n", + do_div(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->u.run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", @@ -961,6 +985,8 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - journal = NULL; - goto out; + goto out_err; } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, p = journal->j_devname; while ((p = strchr(p, '/'))) *p = '!'; - jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; -out: + return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /** @@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; + goto out_err; } err = jbd2_journal_bmap(journal, 0, &blocknr); @@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /* @@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal) } /** - * int jbd2_journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int jbd2_journal_create(journal_t *journal) -{ - unsigned long long blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! - */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = jbd2_journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JBD2_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - -/** * void jbd2_journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. * @wait: Set to '0' if you don't want to wait for IO completion. @@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 39b7805a599a..46b4e347ed7d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -25,6 +25,7 @@ #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -741,6 +743,12 @@ done: source = kmap_atomic(page, KM_USER0); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); + + /* + * Now that the frozen data is saved off, we need to store + * any matching triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; } jbd_unlock_bh_state(bh); @@ -944,6 +952,47 @@ out: } /** + * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = bh2jh(bh); + + jh->b_triggers = type; +} + +void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_commit) + return; + + triggers->t_commit(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + + + +/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle) /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... * - * But don't do this if this process was the most recent one to - * perform a synchronous write. We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index c73fa89b5f8a..170d289ac785 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -22,9 +22,7 @@ #define BIT_DIVIDER_MIPS 1043 -static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ - -#include <linux/errno.h> +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; struct pushpull { unsigned char *buf; @@ -43,7 +41,9 @@ struct rubin_state { int bits[8]; }; -static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) { pp->buf = buf; pp->buflen = buflen; @@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) { - if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) return -ENOSPC; - } - if (bit) { - pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); - } - else { - pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); - } + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + pp->ofs++; return 0; @@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits) rs->p = (long) (2 * UPPER_BIT_RUBIN); rs->bit_number = (long) 0; rs->bit_divider = div; + for (c=0; c<8; c++) rs->bits[c] = bits[c]; } @@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) long i0, i1; int ret; - while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { rs->bit_number++; ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); @@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) rs->p <<= 1; } i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } + i1 = rs->p - i0; if (symbol == 0) @@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits) /* behalve lower */ rs->rec_q = 0; - for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) ; } -static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) { register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; unsigned long rec_q; @@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B) __do_decode(rs, p, q); i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } threshold = rs->q + i0; symbol = rs->rec_q >= threshold; @@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte) struct rubin_state rs_copy; rs_copy = *rs; - for (i=0;i<8;i++) { - ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); if (ret) { /* Failed. Restore old state */ *rs = rs_copy; return ret; } - byte=byte>>1; + byte >>= 1 ; } return 0; } @@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs) int i, result = 0, bit_divider = rs->bit_divider; for (i = 0; i < 8; i++) - result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; return result; } @@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs) static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, - unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) { int outpos = 0; int pos=0; @@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen, void *model) { - return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); } #endif static int jffs2_dynrubin_compress(unsigned char *data_in, @@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return -1; memset(histo, 0, 256); - for (i=0; i<mysrclen; i++) { + for (i=0; i<mysrclen; i++) histo[data_in[i]]++; - } memset(bits, 0, sizeof(int)*8); for (i=0; i<256; i++) { if (i&128) @@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, cpage_out[i] = bits[i]; } - ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); if (ret) return ret; @@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return 0; } -static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, - unsigned char *page_out, uint32_t srclen, uint32_t destlen) +static void rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) { int outpos = 0; struct rubin_state rs; @@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); init_decode(&rs, bit_divider, bits); - while (outpos < destlen) { + while (outpos < destlen) page_out[outpos++] = in_byte(&rs); - } } @@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, uint32_t sourcelen, uint32_t dstlen, void *model) { - rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); return 0; } @@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in, for (c=0; c<8; c++) bits[c] = data_in[c]; - rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); return 0; } static struct jffs2_compressor jffs2_rubinmips_comp = { - .priority = JFFS2_RUBINMIPS_PRIORITY, - .name = "rubinmips", - .compr = JFFS2_COMPR_DYNRUBIN, - .compress = NULL, /*&jffs2_rubinmips_compress,*/ - .decompress = &jffs2_rubinmips_decompress, + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, #ifdef JFFS2_RUBINMIPS_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_rubinmips_init(void) { - return jffs2_register_compressor(&jffs2_rubinmips_comp); + return jffs2_register_compressor(&jffs2_rubinmips_comp); } void jffs2_rubinmips_exit(void) { - jffs2_unregister_compressor(&jffs2_rubinmips_comp); + jffs2_unregister_compressor(&jffs2_rubinmips_comp); } static struct jffs2_compressor jffs2_dynrubin_comp = { - .priority = JFFS2_DYNRUBIN_PRIORITY, - .name = "dynrubin", - .compr = JFFS2_COMPR_RUBINMIPS, - .compress = jffs2_dynrubin_compress, - .decompress = &jffs2_dynrubin_decompress, + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, #ifdef JFFS2_DYNRUBIN_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_dynrubin_init(void) { - return jffs2_register_compressor(&jffs2_dynrubin_comp); + return jffs2_register_compressor(&jffs2_dynrubin_comp); } void jffs2_dynrubin_exit(void) { - jffs2_unregister_compressor(&jffs2_dynrubin_comp); + jffs2_unregister_compressor(&jffs2_dynrubin_comp); } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910af..c32b4a1ad6cf 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { @@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { jffs2_erase_succeeded(priv->c, priv->jeb); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5a98aa87c853..5edc2bf20581 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; - pg = __grab_cache_page(mapping, index); + pg = grab_cache_page_write_begin(mapping, index, flags); if (!pg) return -ENOMEM; *pagep = pg; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 1750445556c3..507ed6ec1847 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c); void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); -struct rb_node *rb_next(struct rb_node *); -struct rb_node *rb_prev(struct rb_node *); -void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 210339784b56..b00ee9f05a06 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) if (inode->i_size >= IDATASIZE) { inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &jfs_aops; - } else + } else { inode->i_op = &jfs_symlink_inode_operations; + /* + * The inline data should be null-terminated, but + * don't let on-disk corruption crash the kernel + */ + JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + } } else { inode->i_op = &jfs_file_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index d6363d8309d0..0f94381ca6d0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -58,9 +58,9 @@ /* * __mark_inode_dirty expects inodes to be hashed. Since we don't want - * special inodes in the fileset inode space, we hash them to a dummy head + * special inodes in the fileset inode space, we make them appear hashed, + * but do not put on any lists. */ -static HLIST_HEAD(aggregate_hash); /* * imap locks @@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* release the page */ release_metapage(mp); - hlist_add_head(&ip->i_hash, &aggregate_hash); + /* + * that will look hashed, but won't be on any list; hlist_del() + * will work fine and require no locking. + */ + ip->i_hash.pprev = &ip->i_hash.next; return (ip); } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 70022fd1c539..d4d142c2edd4 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) inode = new_inode(sb); if (!inode) { jfs_warn("ialloc: new_inode returned NULL!"); - return ERR_PTR(-ENOMEM); + rc = -ENOMEM; + goto fail; } jfs_inode = JFS_IP(inode); @@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_warn("ialloc: diAlloc returned %d!", rc); if (rc == -EIO) make_bad_inode(inode); - iput(inode); - return ERR_PTR(rc); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_unlock; } inode->i_uid = current_fsuid(); @@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) * Allocate inode to quota. */ if (DQUOT_ALLOC_INODE(inode)) { - DQUOT_DROP(inode); - inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - iput(inode); - return ERR_PTR(-EDQUOT); + rc = -EDQUOT; + goto fail_drop; } inode->i_mode = mode; @@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_info("ialloc returns inode = 0x%p\n", inode); return inode; + +fail_drop: + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; +fail_unlock: + inode->i_nlink = 0; + unlock_new_inode(inode); +fail_put: + iput(inode); +fail: + return ERR_PTR(rc); } diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index cc3cedffbfa1..b4de56b851e4 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, ip->i_fop = &jfs_file_operations; ip->i_mapping->a_ops = &jfs_aops; - insert_inode_hash(ip); mark_inode_dirty(ip); dip->i_ctime = dip->i_mtime = CURRENT_TIME; @@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; - insert_inode_hash(ip); mark_inode_dirty(ip); /* update parent directory inode */ @@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, goto out3; } - insert_inode_hash(ip); mark_inode_dirty(ip); dip->i_ctime = dip->i_mtime = CURRENT_TIME; @@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out2: free_UCSname(&dname); @@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_ip->dev = new_encode_dev(rdev); init_special_inode(ip, ip->i_mode, rdev); - insert_inode_hash(ip); mark_inode_dirty(ip); dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (rc) { free_ea_wmap(ip); ip->i_nlink = 0; + unlock_new_inode(ip); iput(ip); - } else + } else { d_instantiate(dentry, ip); + unlock_new_inode(ip); + } out1: free_UCSname(&dname); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0dae345e481b..b37d1f78b854 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -543,7 +543,7 @@ out_kfree: return ret; } -static void jfs_write_super_lockfs(struct super_block *sb) +static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb) lmLogShutdown(log); updateSuper(sb, FM_CLEAN); } + return 0; } -static void jfs_unlockfs(struct super_block *sb) +static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; @@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb) else txResume(sb); } + return 0; } static int jfs_get_sb(struct file_system_type *fs_type, @@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = { .delete_inode = jfs_delete_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, - .write_super_lockfs = jfs_write_super_lockfs, - .unlockfs = jfs_unlockfs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .remount_fs = jfs_remount, .show_options = jfs_show_options, diff --git a/fs/libfs.c b/fs/libfs.c index e960a8321902..49b44099dabb 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_uid = root->i_gid = 0; root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; dentry = d_alloc(NULL, &d_name); if (!dentry) { @@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_uid = inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files if (!inode) goto out; inode->i_mode = S_IFREG | files->mode; - inode->i_uid = inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; inode->i_ino = i; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 8307dd64bf46..1f3b0fc0d351 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/smp_lock.h> +#include <linux/kthread.h> #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname); + nlm_init->hostname, nlm_init->noresvport); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); @@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) void nlmclnt_recovery(struct nlm_host *host) { + struct task_struct *task; + if (!host->h_reclaiming++) { nlm_get_host(host); - __module_get(THIS_MODULE); - if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0) - module_put(THIS_MODULE); + task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name); + if (IS_ERR(task)) + printk(KERN_ERR "lockd: unable to spawn reclaimer " + "thread. Locks for %s won't be reclaimed! " + "(%ld)\n", host->h_name, PTR_ERR(task)); } } @@ -207,7 +212,6 @@ reclaimer(void *ptr) struct file_lock *fl, *next; u32 nsmstate; - daemonize("%s-reclaim", host->h_name); allow_signal(SIGKILL); down_write(&host->h_rwsem); @@ -233,7 +237,12 @@ restart: list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { list_del_init(&fl->fl_u.nfs_fl.list); - /* Why are we leaking memory here? --okir */ + /* + * sending this thread a SIGKILL will result in any unreclaimed + * locks being removed from the h_granted list. This means that + * the kernel will not attempt to reclaim them again if a new + * reclaimer thread is spawned for this host. + */ if (signalled()) continue; if (nlmclnt_reclaim(host, fl) != 0) @@ -261,5 +270,5 @@ restart: nlm_release_host(host); lockd_down(); unlock_kernel(); - module_put_and_exit(0); + return 0; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 31668b690e03..dd7957064a8c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -16,7 +16,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) @@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) unsigned char fl_type; int status = -ENOLCK; - if (nsm_monitor(host) < 0) { - printk(KERN_NOTICE "lockd: failed to monitor %s\n", - host->h_name); + if (nsm_monitor(host) < 0) goto out; - } + fl->fl_flags |= FL_ACCESS; status = do_vfs_lock(fl); fl->fl_flags = fl_flags; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index e05d04416037..99d737bd4325 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -15,7 +15,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #include <linux/mutex.h> #include <net/ipv6.h> @@ -32,11 +31,6 @@ static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); -static struct nsm_handle *nsm_find(const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len, - const int create); struct nlm_lookup_host_info { const int server; /* search for server|client */ @@ -48,6 +42,7 @@ struct nlm_lookup_host_info { const size_t hostname_len; /* it's length */ const struct sockaddr *src_sap; /* our address (optional) */ const size_t src_len; /* it's length */ + const int noresvport; /* use non-priv port */ }; /* @@ -104,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap) } } -static void nlm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_UNSPEC: - snprintf(buf, len, "unspecified"); - break; - case AF_INET: - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); - break; - case AF_INET6: - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", - &sin6->sin6_addr.s6_addr32[3]); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - /* * Common host lookup routine for server & client */ @@ -189,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find(ni->sap, ni->salen, - ni->hostname, ni->hostname_len, 1); + nsm = nsm_get_handle(ni->sap, ni->salen, + ni->hostname, ni->hostname_len); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " "no nsm handle\n"); @@ -205,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) goto out; } host->h_name = nsm->sm_name; + host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; nlm_clear_port(nlm_addr(host)); @@ -222,6 +192,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; host->h_server = ni->server; + host->h_noresvport = ni->noresvport; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -230,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) nrhosts++; - nlm_display_address((struct sockaddr *)&host->h_addr, - host->h_addrbuf, sizeof(host->h_addrbuf)); - nlm_display_address((struct sockaddr *)&host->h_srcaddr, - host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); - dprintk("lockd: nlm_lookup_host created host %s\n", host->h_name); @@ -254,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host) BUG_ON(!list_empty(&host->h_lockowners)); BUG_ON(atomic_read(&host->h_count)); - /* - * Release NSM handle and unmonitor host. - */ nsm_unmonitor(host); + nsm_release(host->h_nsmhandle); clnt = host->h_rpcclnt; if (clnt != NULL) @@ -272,6 +236,7 @@ nlm_destroy_host(struct nlm_host *host) * @protocol: transport protocol to use * @version: NLM protocol version * @hostname: '\0'-terminated hostname of server + * @noresvport: 1 if non-privileged port should be used * * Returns an nlm_host structure that matches the passed-in * [server address, transport protocol, NLM version, server hostname]. @@ -281,7 +246,9 @@ nlm_destroy_host(struct nlm_host *host) struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const size_t salen, const unsigned short protocol, - const u32 version, const char *hostname) + const u32 version, + const char *hostname, + int noresvport) { const struct sockaddr source = { .sa_family = AF_UNSPEC, @@ -296,6 +263,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .hostname_len = strlen(hostname), .src_sap = &source, .src_len = sizeof(source), + .noresvport = noresvport, }; dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, @@ -372,8 +340,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", - host->h_name, host->h_addrbuf, host->h_srcaddrbuf); + dprintk("lockd: nlm_bind_host %s (%s)\n", + host->h_name, host->h_addrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -417,6 +385,8 @@ nlm_bind_host(struct nlm_host *host) */ if (!host->h_server) args.flags |= RPC_CLNT_CREATE_HARDRTRY; + if (host->h_noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; clnt = rpc_create(&args); if (!IS_ERR(clnt)) @@ -473,35 +443,23 @@ void nlm_release_host(struct nlm_host *host) } } -/* - * We were notified that the host indicated by address &sin - * has rebooted. - * Release all resources held by that peer. +/** + * nlm_host_rebooted - Release all resources held by rebooted host + * @info: pointer to decoded results of NLM_SM_NOTIFY call + * + * We were notified that the specified host has rebooted. Release + * all resources held by that peer. */ -void nlm_host_rebooted(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len, - u32 new_state) +void nlm_host_rebooted(const struct nlm_reboot *info) { struct hlist_head *chain; struct hlist_node *pos; struct nsm_handle *nsm; struct nlm_host *host; - nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), - hostname, hostname_len, 0); - if (nsm == NULL) { - dprintk("lockd: never saw rebooted peer '%.*s' before\n", - hostname_len, hostname); + nsm = nsm_reboot_lookup(info); + if (unlikely(nsm == NULL)) return; - } - - dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", - hostname_len, hostname, nsm->sm_addrbuf); - - /* When reclaiming locks on this peer, make sure that - * we set up a new notification */ - nsm->sm_monitored = 0; /* Mark all hosts tied to this NSM state as having rebooted. * We run the loop repeatedly, because we drop the host table @@ -512,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex); for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { hlist_for_each_entry(host, pos, chain, h_hash) { if (host->h_nsmhandle == nsm - && host->h_nsmstate != new_state) { - host->h_nsmstate = new_state; + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; host->h_state++; nlm_get_host(host); @@ -621,89 +579,3 @@ nlm_gc_hosts(void) next_gc = jiffies + NLM_HOST_COLLECT; } - - -/* - * Manage NSM handles - */ -static LIST_HEAD(nsm_handles); -static DEFINE_SPINLOCK(nsm_lock); - -static struct nsm_handle *nsm_find(const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len, - const int create) -{ - struct nsm_handle *nsm = NULL; - struct nsm_handle *pos; - - if (!sap) - return NULL; - - if (hostname && memchr(hostname, '/', hostname_len) != NULL) { - if (printk_ratelimit()) { - printk(KERN_WARNING "Invalid hostname \"%.*s\" " - "in NFS lock request\n", - (int)hostname_len, hostname); - } - return NULL; - } - -retry: - spin_lock(&nsm_lock); - list_for_each_entry(pos, &nsm_handles, sm_link) { - - if (hostname && nsm_use_hostnames) { - if (strlen(pos->sm_name) != hostname_len - || memcmp(pos->sm_name, hostname, hostname_len)) - continue; - } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) - continue; - atomic_inc(&pos->sm_count); - kfree(nsm); - nsm = pos; - goto found; - } - if (nsm) { - list_add(&nsm->sm_link, &nsm_handles); - goto found; - } - spin_unlock(&nsm_lock); - - if (!create) - return NULL; - - nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); - if (nsm == NULL) - return NULL; - - memcpy(nsm_addr(nsm), sap, salen); - nsm->sm_addrlen = salen; - nsm->sm_name = (char *) (nsm + 1); - memcpy(nsm->sm_name, hostname, hostname_len); - nsm->sm_name[hostname_len] = '\0'; - nlm_display_address((struct sockaddr *)&nsm->sm_addr, - nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); - atomic_set(&nsm->sm_count, 1); - goto retry; - -found: - spin_unlock(&nsm_lock); - return nsm; -} - -/* - * Release an NSM handle - */ -void -nsm_release(struct nsm_handle *nsm) -{ - if (!nsm) - return; - if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { - list_del(&nsm->sm_link); - spin_unlock(&nsm_lock); - kfree(nsm); - } -} diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index ffd3461f75ef..5e2c4d5ac827 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -9,35 +9,123 @@ #include <linux/types.h> #include <linux/utsname.h> #include <linux/kernel.h> +#include <linux/ktime.h> + #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_MONITOR +#define NSM_PROGRAM 100024 +#define NSM_VERSION 1 + +enum { + NSMPROC_NULL, + NSMPROC_STAT, + NSMPROC_MON, + NSMPROC_UNMON, + NSMPROC_UNMON_ALL, + NSMPROC_SIMU_CRASH, + NSMPROC_NOTIFY, +}; + +struct nsm_args { + struct nsm_private *priv; + u32 prog; /* RPC callback info */ + u32 vers; + u32 proc; -#define XDR_ADDRBUF_LEN (20) + char *mon_name; +}; -static struct rpc_clnt * nsm_create(void); +struct nsm_res { + u32 status; + u32 state; +}; static struct rpc_program nsm_program; +static LIST_HEAD(nsm_handles); +static DEFINE_SPINLOCK(nsm_lock); /* * Local NSM state */ -int nsm_local_state; +int __read_mostly nsm_local_state; +int __read_mostly nsm_use_hostnames; -/* - * Common procedure for SM_MON/SM_UNMON calls - */ -static int -nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) +{ + return (struct sockaddr *)&nsm->sm_addr; +} + +static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, + const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); +} + +static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, + const size_t len) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); + else if (sin6->sin6_scope_id != 0) + snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, + sin6->sin6_scope_id); + else + snprintf(buf, len, "%pI6", &sin6->sin6_addr); +} + +static void nsm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + switch (sap->sa_family) { + case AF_INET: + nsm_display_ipv4_address(sap, buf, len); + break; + case AF_INET6: + nsm_display_ipv6_address(sap, buf, len); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} + +static struct rpc_clnt *nsm_create(void) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_create_args args = { + .protocol = XPRT_TRANSPORT_UDP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "rpc.statd", + .program = &nsm_program, + .version = NSM_VERSION, + .authflavor = RPC_AUTH_NULL, + }; + + return rpc_create(&args); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) { struct rpc_clnt *clnt; int status; - struct nsm_args args; + struct nsm_args args = { + .priv = &nsm->sm_priv, + .prog = NLM_PROGRAM, + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, + }; struct rpc_message msg = { .rpc_argp = &args, .rpc_resp = res, @@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) clnt = nsm_create(); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d\n", status); goto out; } - memset(&args, 0, sizeof(args)); - args.mon_name = nsm->sm_name; - args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; - args.prog = NLM_PROGRAM; - args.vers = 3; - args.proc = NLMPROC_NSM_NOTIFY; memset(res, 0, sizeof(*res)); msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, 0); if (status < 0) - printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", - status); + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); else status = 0; rpc_shutdown_client(clnt); @@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) return status; } -/* - * Set up monitoring of a remote host +/** + * nsm_monitor - Notify a peer in case we reboot + * @host: pointer to nlm_host of peer to notify + * + * If this peer is not already monitored, this function sends an + * upcall to the local rpc.statd to record the name/address of + * the peer to notify in case we reboot. + * + * Returns zero if the peer is monitored by the local rpc.statd; + * otherwise a negative errno value is returned. */ -int -nsm_monitor(struct nlm_host *host) +int nsm_monitor(const struct nlm_host *host) { struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; - dprintk("lockd: nsm_monitor(%s)\n", host->h_name); - BUG_ON(nsm == NULL); + dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; - status = nsm_mon_unmon(nsm, SM_MON, &res); + /* + * Choose whether to record the caller_name or IP address of + * this peer in the local rpc.statd's database. + */ + nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - if (status < 0 || res.status != 0) - printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); + if (res.status != 0) + status = -EIO; + if (status < 0) + printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); else nsm->sm_monitored = 1; return status; } -/* - * Cease to monitor remote host +/** + * nsm_unmonitor - Unregister peer notification + * @host: pointer to nlm_host of peer to stop monitoring + * + * If this peer is monitored, this function sends an upcall to + * tell the local rpc.statd not to send this peer a notification + * when we reboot. */ -int -nsm_unmonitor(struct nlm_host *host) +void nsm_unmonitor(const struct nlm_host *host) { struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; - int status = 0; - - if (nsm == NULL) - return 0; - host->h_nsmhandle = NULL; + int status; if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { - dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, SM_UNMON, &res); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); + if (res.status != 0) + status = -EIO; if (status < 0) printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", - host->h_name); + nsm->sm_name); else nsm->sm_monitored = 0; } - nsm_release(nsm); - return status; +} + +static struct nsm_handle *nsm_lookup_hostname(const char *hostname, + const size_t len) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (strlen(nsm->sm_name) == len && + memcmp(nsm->sm_name, hostname, len) == 0) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (nlm_cmp_addr(nsm_addr(nsm), sap)) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (memcmp(nsm->sm_priv.data, priv->data, + sizeof(priv->data)) == 0) + return nsm; + return NULL; } /* - * Create NSM client for the local host + * Construct a unique cookie to match this nsm_handle to this monitored + * host. It is passed to the local rpc.statd via NSMPROC_MON, and + * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these + * requests. + * + * The NSM protocol requires that these cookies be unique while the + * system is running. We prefer a stronger requirement of making them + * unique across reboots. If user space bugs cause a stale cookie to + * be sent to the kernel, it could cause the wrong host to lose its + * lock state if cookies were not unique across reboots. + * + * The cookies are exposed only to local user space via loopback. They + * do not appear on the physical network. If we want greater security + * for some reason, nsm_init_private() could perform a one-way hash to + * obscure the contents of the cookie. */ -static struct rpc_clnt * -nsm_create(void) +static void nsm_init_private(struct nsm_handle *nsm) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - .sin_port = 0, - }; - struct rpc_create_args args = { - .protocol = XPRT_TRANSPORT_UDP, - .address = (struct sockaddr *)&sin, - .addrsize = sizeof(sin), - .servername = "localhost", - .program = &nsm_program, - .version = SM_VERSION, - .authflavor = RPC_AUTH_NULL, - }; + u64 *p = (u64 *)&nsm->sm_priv.data; + struct timespec ts; - return rpc_create(&args); + ktime_get_ts(&ts); + *p++ = timespec_to_ns(&ts); + *p = (unsigned long)nsm; +} + +static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *new; + + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); + if (unlikely(new == NULL)) + return NULL; + + atomic_set(&new->sm_count, 1); + new->sm_name = (char *)(new + 1); + memcpy(nsm_addr(new), sap, salen); + new->sm_addrlen = salen; + nsm_init_private(new); + nsm_display_address((const struct sockaddr *)&new->sm_addr, + new->sm_addrbuf, sizeof(new->sm_addrbuf)); + memcpy(new->sm_name, hostname, hostname_len); + new->sm_name[hostname_len] = '\0'; + + return new; +} + +/** + * nsm_get_handle - Find or create a cached nsm_handle + * @sap: pointer to socket address of handle to find + * @salen: length of socket address + * @hostname: pointer to C string containing hostname to find + * @hostname_len: length of C string + * + * Behavior is modulated by the global nsm_use_hostnames variable. + * + * Returns a cached nsm_handle after bumping its ref count, or + * returns a fresh nsm_handle if a handle that matches @sap and/or + * @hostname cannot be found in the handle cache. Returns NULL if + * an error occurs. + */ +struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, + const size_t salen, const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *cached, *new = NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + (int)hostname_len, hostname); + } + return NULL; + } + +retry: + spin_lock(&nsm_lock); + + if (nsm_use_hostnames && hostname != NULL) + cached = nsm_lookup_hostname(hostname, hostname_len); + else + cached = nsm_lookup_addr(sap); + + if (cached != NULL) { + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + kfree(new); + dprintk("lockd: found nsm_handle for %s (%s), " + "cnt %d\n", cached->sm_name, + cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; + } + + if (new != NULL) { + list_add(&new->sm_link, &nsm_handles); + spin_unlock(&nsm_lock); + dprintk("lockd: created nsm_handle for %s (%s)\n", + new->sm_name, new->sm_addrbuf); + return new; + } + + spin_unlock(&nsm_lock); + + new = nsm_create_handle(sap, salen, hostname, hostname_len); + if (unlikely(new == NULL)) + return NULL; + goto retry; +} + +/** + * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @info: pointer to NLMPROC_SM_NOTIFY arguments + * + * Returns a matching nsm_handle if found in the nsm cache; the returned + * nsm_handle's reference count is bumped and sm_monitored is cleared. + * Otherwise returns NULL if some error occurred. + */ +struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +{ + struct nsm_handle *cached; + + spin_lock(&nsm_lock); + + cached = nsm_lookup_priv(&info->priv); + if (unlikely(cached == NULL)) { + spin_unlock(&nsm_lock); + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + info->len, info->mon); + return cached; + } + + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + + /* + * During subsequent lock activity, force a fresh + * notification to be set up for this host. + */ + cached->sm_monitored = 0; + + dprintk("lockd: host %s (%s) rebooted, cnt %d\n", + cached->sm_name, cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; +} + +/** + * nsm_release - Release an NSM handle + * @nsm: pointer to handle to be released + * + */ +void nsm_release(struct nsm_handle *nsm) +{ + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + dprintk("lockd: destroyed nsm_handle for %s (%s)\n", + nsm->sm_name, nsm->sm_addrbuf); + kfree(nsm); + } } /* @@ -154,127 +428,132 @@ nsm_create(void) * Status Monitor wire protocol. */ -static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) +static int encode_nsm_string(struct xdr_stream *xdr, const char *string) { - size_t len = strlen(string); - - if (len > SM_MAXSTRLEN) - len = SM_MAXSTRLEN; - return xdr_encode_opaque(p, string, len); + const u32 len = strlen(string); + __be32 *p; + + if (unlikely(len > SM_MAXSTRLEN)) + return -EIO; + p = xdr_reserve_space(xdr, sizeof(u32) + len); + if (unlikely(p == NULL)) + return -EIO; + xdr_encode_opaque(p, string, len); + return 0; } /* * "mon_name" specifies the host to be monitored. - * - * Linux uses a text version of the IP address of the remote - * host as the host identifier (the "mon_name" argument). - * - * Linux statd always looks up the canonical hostname first for - * whatever remote hostname it receives, so this works alright. */ -static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) +static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) { - char buffer[XDR_ADDRBUF_LEN + 1]; - char *name = argp->mon_name; - - if (!nsm_use_hostnames) { - snprintf(buffer, XDR_ADDRBUF_LEN, - "%pI4", &argp->addr); - name = buffer; - } - - return xdr_encode_nsm_string(p, name); + return encode_nsm_string(xdr, argp->mon_name); } /* * The "my_id" argument specifies the hostname and RPC procedure * to be called when the status manager receives notification - * (via the SM_NOTIFY call) that the state of host "mon_name" + * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" * has changed. */ -static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) +static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - p = xdr_encode_nsm_string(p, utsname()->nodename); - if (!p) - return ERR_PTR(-EIO); - + int status; + __be32 *p; + + status = encode_nsm_string(xdr, utsname()->nodename); + if (unlikely(status != 0)) + return status; + p = xdr_reserve_space(xdr, 3 * sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); *p++ = htonl(argp->proc); - - return p; + return 0; } /* * The "mon_id" argument specifies the non-private arguments - * of an SM_MON or SM_UNMON call. + * of an NSMPROC_MON or NSMPROC_UNMON call. */ -static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) +static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - p = xdr_encode_mon_name(p, argp); - if (!p) - return ERR_PTR(-EIO); + int status; - return xdr_encode_my_id(p, argp); + status = encode_mon_name(xdr, argp); + if (unlikely(status != 0)) + return status; + return encode_my_id(xdr, argp); } /* * The "priv" argument may contain private information required - * by the SM_MON call. This information will be supplied in the - * SM_NOTIFY call. - * - * Linux provides the raw IP address of the monitored host, - * left in network byte order. + * by the NSMPROC_MON call. This information will be supplied in the + * NLMPROC_SM_NOTIFY call. */ -static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) +static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) { - *p++ = argp->addr; - *p++ = 0; - *p++ = 0; - *p++ = 0; + __be32 *p; - return p; + p = xdr_reserve_space(xdr, SM_PRIV_SIZE); + if (unlikely(p == NULL)) + return -EIO; + xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); + return 0; } -static int -xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, + const struct nsm_args *argp) { - p = xdr_encode_mon_id(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - - p = xdr_encode_priv(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - - rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); - return 0; + struct xdr_stream xdr; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + status = encode_mon_id(&xdr, argp); + if (unlikely(status)) + return status; + return encode_priv(&xdr, argp); } -static int -xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, + const struct nsm_args *argp) { - p = xdr_encode_mon_id(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); - return 0; + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + return encode_mon_id(&xdr, argp); } -static int -xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) +static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, + struct nsm_res *resp) { + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + p = xdr_inline_decode(&xdr, 2 * sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; resp->status = ntohl(*p++); - resp->state = ntohl(*p++); - dprintk("nsm: xdr_decode_stat_res status %d state %d\n", + resp->state = ntohl(*p); + + dprintk("lockd: xdr_dec_stat_res status %d state %d\n", resp->status, resp->state); return 0; } -static int -xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) +static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, + struct nsm_res *resp) { - resp->state = ntohl(*p++); + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + p = xdr_inline_decode(&xdr, sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; + resp->state = ntohl(*p); + + dprintk("lockd: xdr_dec_stat state %d\n", resp->state); return 0; } @@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) #define SM_unmonres_sz 1 static struct rpc_procinfo nsm_procedures[] = { -[SM_MON] = { - .p_proc = SM_MON, - .p_encode = (kxdrproc_t) xdr_encode_mon, - .p_decode = (kxdrproc_t) xdr_decode_stat_res, +[NSMPROC_MON] = { + .p_proc = NSMPROC_MON, + .p_encode = (kxdrproc_t)xdr_enc_mon, + .p_decode = (kxdrproc_t)xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, - .p_statidx = SM_MON, + .p_statidx = NSMPROC_MON, .p_name = "MONITOR", }, -[SM_UNMON] = { - .p_proc = SM_UNMON, - .p_encode = (kxdrproc_t) xdr_encode_unmon, - .p_decode = (kxdrproc_t) xdr_decode_stat, +[NSMPROC_UNMON] = { + .p_proc = NSMPROC_UNMON, + .p_encode = (kxdrproc_t)xdr_enc_unmon, + .p_decode = (kxdrproc_t)xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, - .p_statidx = SM_UNMON, + .p_statidx = NSMPROC_UNMON, .p_name = "UNMONITOR", }, }; @@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats; static struct rpc_program nsm_program = { .name = "statd", - .number = SM_PROGRAM, + .number = NSM_PROGRAM, .nrvers = ARRAY_SIZE(nsm_version), .version = nsm_version, .stats = &nsm_stats diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 56b076736b56..64f1c31b5853 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -35,7 +35,6 @@ #include <linux/sunrpc/svcsock.h> #include <net/ip.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #include <linux/nfs.h> #define NLMDBG_FACILITY NLMDBG_SVC @@ -45,7 +44,7 @@ static struct svc_program nlmsvc_program; struct nlmsvc_binding * nlmsvc_ops; -EXPORT_SYMBOL(nlmsvc_ops); +EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; @@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; /* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. + */ +#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ + defined(CONFIG_SUNRPC_REGISTER_V4) +static const sa_family_t nlmsvc_family = AF_INET6; +#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ +static const sa_family_t nlmsvc_family = AF_INET; +#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ + +/* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; -int nsm_use_hostnames = 0; + +/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ +static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. @@ -143,6 +155,9 @@ lockd(void *vrqstp) long timeout = MAX_SCHEDULE_TIMEOUT; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + /* update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nlm_max_connections; + if (signalled()) { flush_signals(current); if (nlmsvc_ops) { @@ -189,6 +204,19 @@ lockd(void *vrqstp) return 0; } +static int create_lockd_listener(struct svc_serv *serv, char *name, + unsigned short port) +{ + struct svc_xprt *xprt; + + xprt = svc_find_xprt(serv, name, 0, 0); + if (xprt == NULL) + return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + + svc_xprt_put(xprt); + return 0; +} + /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -202,29 +230,23 @@ lockd(void *vrqstp) static int make_socks(struct svc_serv *serv) { static int warned; - struct svc_xprt *xprt; - int err = 0; + int err; - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - if (err >= 0) { - xprt = svc_find_xprt(serv, "tcp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "tcp", nlm_tcpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0) { - warned = 0; - err = 0; - } else if (warned++ == 0) + err = create_lockd_listener(serv, "udp", nlm_udpport); + if (err < 0) + goto out_err; + + err = create_lockd_listener(serv, "tcp", nlm_tcpport); + if (err < 0) + goto out_err; + + warned = 0; + return 0; + +out_err: + if (warned++ == 0) printk(KERN_WARNING - "lockd_up: makesock failed, error=%d\n", err); + "lockd_up: makesock failed, error=%d\n", err); return err; } @@ -252,7 +274,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; @@ -276,6 +298,7 @@ int lockd_up(void) } svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { @@ -300,7 +323,7 @@ out: mutex_unlock(&nlmsvc_mutex); return error; } -EXPORT_SYMBOL(lockd_up); +EXPORT_SYMBOL_GPL(lockd_up); /* * Decrement the user count and bring down lockd if we're the last. @@ -329,7 +352,7 @@ lockd_down(void) out: mutex_unlock(&nlmsvc_mutex); } -EXPORT_SYMBOL(lockd_down); +EXPORT_SYMBOL_GPL(lockd_down); #ifdef CONFIG_SYSCTL @@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); +module_param(nlm_max_connections, uint, 0644); /* * Initialising and terminating the module. diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4dfdcbc6bf68..1725037374c5 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -16,8 +16,6 @@ #include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -419,8 +417,6 @@ static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr; - dprintk("lockd: SM_NOTIFY called\n"); if (!nlm_privileged_requester(rqstp)) { @@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - /* Obtain the host pointer for this NFS server and try to - * reclaim all locks we hold on this server. - */ - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = argp->addr; - nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); - + nlm_host_rebooted(argp); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3ca89e2a9381..3688e55901fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -16,8 +16,6 @@ #include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -451,8 +449,6 @@ static __be32 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr; - dprintk("lockd: SM_NOTIFY called\n"); if (!nlm_privileged_requester(rqstp)) { @@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - /* Obtain the host pointer for this NFS server and try to - * reclaim all locks we hold on this server. - */ - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = argp->addr; - nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); - + nlm_host_rebooted(argp); return rpc_success; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 34c2766e27c7..9e4d6aab611b 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -17,7 +17,6 @@ #include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> #include <linux/module.h> #include <linux/mount.h> diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 1f226290c67c..0336f2beacde 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,7 +16,6 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_XDR @@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) return 0; argp->state = ntohl(*p++); - /* Preserve the address in network byte order */ - argp->addr = *p++; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 50c493a8ad8e..e1d528653192 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -17,7 +17,6 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_XDR @@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) return 0; argp->state = ntohl(*p++); - /* Preserve the address in network byte order */ - argp->addr = *p++; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); return xdr_argsize_check(rqstp, p); } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index f70433816a38..d4946c4c90e2 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) return -EINVAL; got_it: - pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); + pos = page_offset(page) + p - (char *)page_address(page); err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err) diff --git a/fs/mpage.c b/fs/mpage.c index 552b80b3facc..16c3ef37eae3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, first_hole = page_block; page_block++; block_in_file++; - clear_buffer_mapped(map_bh); continue; } @@ -308,7 +307,10 @@ alloc_new: goto alloc_new; } - if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) bio = mpage_bio_submit(READ, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; diff --git a/fs/namei.c b/fs/namei.c index af3783fff1de..f05bed242422 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } +/** + * inode_permission - check for access rights to a given inode + * @inode: inode to check permission on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * + * Used to check for read/write/execute permissions on an inode. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ int inode_permission(struct inode *inode, int mask) { int retval; @@ -247,8 +257,7 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - /* Ordinary permission routines do not understand MAY_APPEND. */ - if (inode->i_op && inode->i_op->permission) + if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else retval = generic_permission(inode, mask, NULL); @@ -265,21 +274,6 @@ int inode_permission(struct inode *inode, int mask) } /** - * vfs_permission - check for access rights to a given path - * @nd: lookup result that describes the path - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Used to check for read/write/execute permissions on a path. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things. - */ -int vfs_permission(struct nameidata *nd, int mask) -{ - return inode_permission(nd->path.dentry->d_inode, mask); -} - -/** * file_permission - check for additional access rights to a given file * @file: file to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) @@ -289,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask) * * Note: * Do not use this function in new code. All access checks should - * be done using vfs_permission(). + * be done using inode_permission(). */ int file_permission(struct file *file, int mask) { @@ -438,7 +432,7 @@ static int exec_permission_lite(struct inode *inode) { umode_t mode = inode->i_mode; - if (inode->i_op && inode->i_op->permission) + if (inode->i_op->permission) return -EAGAIN; if (current_fsuid() == inode->i_uid) @@ -527,18 +521,6 @@ out_unlock: return result; } -/* SMP-safe */ -static __always_inline void -walk_init_root(const char *name, struct nameidata *nd) -{ - struct fs_struct *fs = current->fs; - - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); -} - /* * Wrapper to retry pathname resolution whenever the underlying * file system returns an ESTALE. @@ -576,9 +558,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l goto fail; if (*link == '/') { + struct fs_struct *fs = current->fs; + path_put(&nd->path); - walk_init_root(link, nd); + + read_lock(&fs->lock); + nd->path = fs->root; + path_get(&fs->root); + read_unlock(&fs->lock); } + res = link_path_walk(link, nd); if (nd->depth || res || nd->last_type!=LAST_NORM) return res; @@ -859,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); if (err == -EAGAIN) - err = vfs_permission(nd, MAY_EXEC); + err = inode_permission(nd->path.dentry->d_inode, + MAY_EXEC); if (err) break; @@ -918,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) inode = next.dentry->d_inode; if (!inode) goto out_dput; - err = -ENOTDIR; - if (!inode->i_op) - goto out_dput; if (inode->i_op->follow_link) { err = do_follow_link(&next, nd); @@ -930,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) inode = nd->path.dentry->d_inode; if (!inode) break; - err = -ENOTDIR; - if (!inode->i_op) - break; } else path_to_nameidata(&next, nd); err = -ENOTDIR; @@ -971,7 +955,7 @@ last_component: break; inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { + && inode && inode->i_op->follow_link) { err = do_follow_link(&next, nd); if (err) goto return_err; @@ -983,7 +967,7 @@ last_component: break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op || !inode->i_op->lookup) + if (!inode->i_op->lookup) break; } goto return_base; @@ -1479,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, if (error) return error; - if (!dir->i_op || !dir->i_op->create) + if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode &= S_IALLUGO; mode |= S_IFREG; @@ -1493,9 +1477,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct nameidata *nd, int acc_mode, int flag) +int may_open(struct path *path, int acc_mode, int flag) { - struct dentry *dentry = nd->path.dentry; + struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; @@ -1516,13 +1500,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { flag &= ~O_TRUNC; } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { - if (nd->path.mnt->mnt_flags & MNT_NODEV) + if (path->mnt->mnt_flags & MNT_NODEV) return -EACCES; flag &= ~O_TRUNC; } - error = vfs_permission(nd, acc_mode); + error = inode_permission(inode, acc_mode); if (error) return error; /* @@ -1556,6 +1540,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) * Refuse to truncate files with mandatory locks held on them. */ error = locks_verify_locked(inode); + if (!error) + error = security_path_truncate(path, 0, + ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); if (!error) { DQUOT_INIT(inode); @@ -1586,14 +1573,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; + error = security_path_mknod(&nd->path, path->dentry, mode, 0); + if (error) + goto out_unlock; error = vfs_create(dir->d_inode, path->dentry, mode, nd); +out_unlock: mutex_unlock(&dir->d_inode->i_mutex); dput(nd->path.dentry); nd->path.dentry = path->dentry; if (error) return error; /* Don't check for write permission, don't truncate */ - return may_open(nd, 0, flag & ~O_TRUNC); + return may_open(&nd->path, 0, flag & ~O_TRUNC); } /* @@ -1755,7 +1746,7 @@ do_last: error = -ENOENT; if (!path.dentry->d_inode) goto exit_dput; - if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) + if (path.dentry->d_inode->i_op->follow_link) goto do_link; path_to_nameidata(&path, &nd); @@ -1779,7 +1770,7 @@ ok: if (error) goto exit; } - error = may_open(&nd, acc_mode, flag); + error = may_open(&nd.path, acc_mode, flag); if (error) { if (will_write) mnt_drop_write(nd.path.mnt); @@ -1936,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; - if (!dir->i_op || !dir->i_op->mknod) + if (!dir->i_op->mknod) return -EPERM; error = devcgroup_inode_mknod(mode, dev); @@ -1999,6 +1990,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_mknod(&nd.path, dentry, mode, dev); + if (error) + goto out_drop_write; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); @@ -2011,6 +2005,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); break; } +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2034,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - if (!dir->i_op || !dir->i_op->mkdir) + if (!dir->i_op->mkdir) return -EPERM; mode &= (S_IRWXUGO|S_ISVTX); @@ -2070,7 +2065,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_mkdir(&nd.path, dentry, mode); + if (error) + goto out_drop_write; error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2121,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) return error; - if (!dir->i_op || !dir->i_op->rmdir) + if (!dir->i_op->rmdir) return -EPERM; DQUOT_INIT(dir); @@ -2180,7 +2179,11 @@ static long do_rmdir(int dfd, const char __user *pathname) error = mnt_want_write(nd.path.mnt); if (error) goto exit3; + error = security_path_rmdir(&nd.path, dentry); + if (error) + goto exit4; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); +exit4: mnt_drop_write(nd.path.mnt); exit3: dput(dentry); @@ -2204,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (error) return error; - if (!dir->i_op || !dir->i_op->unlink) + if (!dir->i_op->unlink) return -EPERM; DQUOT_INIT(dir); @@ -2265,7 +2268,11 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = mnt_want_write(nd.path.mnt); if (error) goto exit2; + error = security_path_unlink(&nd.path, dentry); + if (error) + goto exit3; error = vfs_unlink(nd.path.dentry->d_inode, dentry); +exit3: mnt_drop_write(nd.path.mnt); exit2: dput(dentry); @@ -2307,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - if (!dir->i_op || !dir->i_op->symlink) + if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); @@ -2346,7 +2353,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_symlink(&nd.path, dentry, from); + if (error) + goto out_drop_write; error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(dentry); @@ -2384,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - if (!dir->i_op || !dir->i_op->link) + if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; @@ -2443,7 +2454,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; + error = security_path_link(old_path.dentry, &nd.path, new_dentry); + if (error) + goto out_drop_write; error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); +out_drop_write: mnt_drop_write(nd.path.mnt); out_dput: dput(new_dentry); @@ -2587,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op || !old_dir->i_op->rename) + if (!old_dir->i_op->rename) return -EPERM; DQUOT_INIT(old_dir); @@ -2679,8 +2694,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname, error = mnt_want_write(oldnd.path.mnt); if (error) goto exit5; + error = security_path_rename(&oldnd.path, old_dentry, + &newnd.path, new_dentry); + if (error) + goto exit6; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); +exit6: mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); @@ -2750,13 +2770,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link) /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) { - struct page * page; + char *kaddr; + struct page *page; struct address_space *mapping = dentry->d_inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; *ppage = page; - return kmap(page); + kaddr = kmap(page); + nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + return kaddr; } int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) @@ -2788,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int __page_symlink(struct inode *inode, const char *symname, int len, - gfp_t gfp_mask) +/* + * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS + */ +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; int err; char *kaddr; + unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + if (nofs) + flags |= AOP_FLAG_NOFS; retry: err = pagecache_write_begin(NULL, mapping, 0, len-1, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + flags, &page, &fsdata); if (err) goto fail; @@ -2823,7 +2851,7 @@ fail: int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - mapping_gfp_mask(inode->i_mapping)); + !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } const struct inode_operations page_symlink_inode_operations = { @@ -2849,7 +2877,6 @@ EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); @@ -2865,3 +2892,10 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; diff --git a/fs/namespace.c b/fs/namespace.c index 1c09cab8f7cf..a40685d800a8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - return ERR_PTR(-ENOMEM);; + return ERR_PTR(-ENOMEM); } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 335b003dddf9..0af3349de851 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -16,7 +16,6 @@ * @opts: an array of &struct option entries controlling parser operations * @optopt: output; will contain the current option * @optarg: output; will contain the value (if one exists) - * @flag: output; may be NULL; should point to a long for or'ing flags * @value: output; may be NULL; will be overwritten with the integer value * of the current argument. * diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6d04e050c74e..f54360f50a9c 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl { s32 auth_type; u32 object_name_len; - compat_caddr_t object_name; /* an userspace data, in most cases user name */ + compat_caddr_t object_name; /* a userspace data, in most cases user name */ }; struct compat_ncp_fs_info_v2 { diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c2e9cfd9e5a4..3e634f2a1083 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/sunrpc/svcauth_gss.h> #include <net/inet_sock.h> @@ -182,10 +183,34 @@ void nfs_callback_down(void) mutex_unlock(&nfs_callback_mutex); } +static int check_gss_callback_principal(struct nfs_client *clp, + struct svc_rqst *rqstp) +{ + struct rpc_clnt *r = clp->cl_rpcclient; + char *p = svc_gss_principal(rqstp); + + /* + * It might just be a normal user principal, in which case + * userspace won't bother to tell us the name at all. + */ + if (p == NULL) + return SVC_DENIED; + + /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ + + if (memcmp(p, "nfs@", 4) != 0) + return SVC_DENIED; + p += 4; + if (strcmp(p, r->cl_server) != 0) + return SVC_DENIED; + return SVC_OK; +} + static int nfs_callback_authenticate(struct svc_rqst *rqstp) { struct nfs_client *clp; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + int ret = SVC_OK; /* Don't talk to strangers */ clp = nfs_find_client(svc_addr(rqstp), 4); @@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) dprintk("%s: %s NFSv4 callback!\n", __func__, svc_print_addr(rqstp, buf, sizeof(buf))); - nfs_put_client(clp); switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) - return SVC_DENIED; + ret = SVC_DENIED; break; case RPC_AUTH_UNIX: break; case RPC_AUTH_GSS: - /* FIXME: RPCSEC_GSS handling? */ + ret = check_gss_callback_principal(clp, rqstp); + break; default: - return SVC_DENIED; + ret = SVC_DENIED; } - return SVC_OK; + nfs_put_client(clp); + return ret; } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 7547600b6174..9b728f3565a1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_proto = cl_init->proto; #ifdef CONFIG_NFS_V4 - init_rwsem(&clp->cl_sem); INIT_LIST_HEAD(&clp->cl_delegations); spin_lock_init(&clp->cl_lock); INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); @@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp) } } -static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, - const struct sockaddr_in *sa2) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped) { - return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; + switch (sa->sa_family) { + default: + return NULL; + case AF_INET6: + return &((const struct sockaddr_in6 *)sa)->sin6_addr; + break; + case AF_INET: + ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr, + addr_mapped); + return addr_mapped; + } } -static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, - const struct sockaddr_in6 *sa2) +static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + return 0; +} +#else +static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, + const struct sockaddr_in *sa2) { - return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); + return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; } static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, const struct sockaddr *sa2) { - switch (sa1->sa_family) { - case AF_INET: - return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, - (const struct sockaddr_in *)sa2); - case AF_INET6: - return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1, - (const struct sockaddr_in6 *)sa2); - } - BUG(); + if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET)) + return 0; + return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, + (const struct sockaddr_in *)sa2); } +#endif /* * Find a client by IP address and protocol version @@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion) if (clp->rpc_ops->version != nfsversion) continue; - if (addr->sa_family != clap->sa_family) - continue; /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(addr, clap)) continue; @@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) if (clp->rpc_ops->version != nfsvers) continue; - if (sap->sa_family != clap->sa_family) - continue; /* Match only the IP address, not the port number */ if (!nfs_sockaddr_match_ipaddr(sap, clap)) continue; @@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, static int nfs_create_rpc_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, rpc_authflavor_t flavor, - int flags) + int discrtry, int noresvport) { struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { @@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, - .flags = flags, }; + if (discrtry) + args.flags |= RPC_CLNT_CREATE_DISCRTRY; + if (noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server) .protocol = server->flags & NFS_MOUNT_TCP ? IPPROTO_TCP : IPPROTO_UDP, .nfs_version = clp->rpc_ops->version, + .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? + 1 : 0, }; if (nlm_init.nfs_version > 3) @@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, + 0, data->flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -965,7 +990,8 @@ error: static int nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr, - rpc_authflavor_t authflavour) + rpc_authflavor_t authflavour, + int flags) { int error; @@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp, clp->rpc_ops = &nfs_v4_clientops; error = nfs_create_rpc_client(clp, timeparms, authflavour, - RPC_CLNT_CREATE_DISCRTRY); + 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); @@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server, error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); + error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, + server->flags); if (error < 0) goto error_put; @@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server, nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); + /* Initialise the client representation from the mount data */ + server->flags = data->flags; + server->caps |= NFS_CAP_ATOMIC_OPEN; + /* Get a client record */ error = nfs4_set_client(server, data->nfs_server.hostname, @@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; - /* Initialise the client representation from the mount data */ - server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN; - if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, parent_server = NFS_SB(data->sb); parent_client = parent_server->nfs_client; + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN; + /* Get a client representation. * Note: NFSv4 always uses TCP, */ error = nfs4_set_client(server, data->hostname, @@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - /* Initialise the client representation from the parent server */ - nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; - error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); if (error < 0) goto error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index cc563cfa6940..968225a88015 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation) put_rpccred(cred); } +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); +} + +int nfs_have_delegation(struct inode *inode, fmode_t flags) +{ + struct nfs_delegation *delegation; + int ret = 0; + + flags &= FMODE_READ|FMODE_WRITE; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && (delegation->type & flags) == flags) { + nfs_mark_delegation_referenced(delegation); + ret = 1; + } + rcu_read_unlock(); + return ret; +} + static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) { struct inode *inode = state->inode; @@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st delegation->maxsize = res->maxsize; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); - delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); NFS_I(inode)->delegation_state = delegation->type; smp_wmb(); put_rpccred(oldcred); @@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * return res; } +static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation) +{ + struct inode *inode = NULL; + + spin_lock(&delegation->lock); + if (delegation->inode != NULL) + inode = igrab(delegation->inode); + spin_unlock(&delegation->lock); + return inode; +} + static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) { struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL) goto nomatch; + spin_lock(&delegation->lock); if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, sizeof(delegation->stateid.data)) != 0) - goto nomatch; + goto nomatch_unlock; list_del_rcu(&delegation->super_list); + delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); + spin_unlock(&delegation->lock); return delegation; +nomatch_unlock: + spin_unlock(&delegation->lock); nomatch: return NULL; } @@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation->change_attr = nfsi->change_attr; delegation->cred = get_rpccred(cred); delegation->inode = inode; + delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); if (rcu_dereference(nfsi->delegation) != NULL) { @@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode) */ static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); nfs_msync_inode(inode); - down_read(&clp->cl_sem); /* Guard against new delegated open calls */ down_write(&nfsi->rwsem); nfs_delegation_claim_opens(inode, &delegation->stateid); up_write(&nfsi->rwsem); - up_read(&clp->cl_sem); nfs_msync_inode(inode); return nfs_do_return_delegation(inode, delegation, 1); } /* + * Return all delegations that have been marked for return + */ +void nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + struct inode *inode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { + if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + spin_lock(&clp->cl_lock); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); + spin_unlock(&clp->cl_lock); + rcu_read_unlock(); + if (delegation != NULL) + __nfs_inode_return_delegation(inode, delegation); + iput(inode); + goto restart; + } + rcu_read_unlock(); +} + +/* * This function returns the delegation without reclaiming opens * or protecting against delegation reclaims. * It is therefore really only safe to be called from @@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } +static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + /* * Return all delegations associated to a super block */ -void nfs_return_all_delegations(struct super_block *sb) +void nfs_super_return_all_delegations(struct super_block *sb) { struct nfs_client *clp = NFS_SB(sb)->nfs_client; struct nfs_delegation *delegation; - struct inode *inode; if (clp == NULL) return; -restart: rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (delegation->inode->i_sb != sb) - continue; - inode = igrab(delegation->inode); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; + spin_lock(&delegation->lock); + if (delegation->inode != NULL && delegation->inode->i_sb == sb) + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); } rcu_read_unlock(); + nfs_client_return_marked_delegations(clp); } -static int nfs_do_expire_all_delegations(void *ptr) +static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) { - struct nfs_client *clp = ptr; struct nfs_delegation *delegation; - struct inode *inode; - allow_signal(SIGKILL); -restart: - if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) - goto out; - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) - goto out; rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - inode = igrab(delegation->inode); - if (inode == NULL) - continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } rcu_read_unlock(); -out: - nfs_put_client(clp); - module_put_and_exit(0); +} + +static void nfs_delegation_run_state_manager(struct nfs_client *clp) +{ + if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) + nfs4_schedule_state_manager(clp); } void nfs_expire_all_delegations(struct nfs_client *clp) { - struct task_struct *task; - - __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); - task = kthread_run(nfs_do_expire_all_delegations, clp, - "%s-delegreturn", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); - if (!IS_ERR(task)) - return; - nfs_put_client(clp); - module_put(THIS_MODULE); + nfs_client_mark_return_all_delegations(clp); + nfs_delegation_run_state_manager(clp); } /* @@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp) */ void nfs_handle_cb_pathdown(struct nfs_client *clp) { - struct nfs_delegation *delegation; - struct inode *inode; - if (clp == NULL) return; -restart: + nfs_client_mark_return_all_delegations(clp); +} + +static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp) +{ + struct nfs_delegation *delegation; + rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - inode = igrab(delegation->inode); - if (inode == NULL) + if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); - spin_unlock(&clp->cl_lock); - rcu_read_unlock(); - if (delegation != NULL) - __nfs_inode_return_delegation(inode, delegation); - iput(inode); - goto restart; + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } rcu_read_unlock(); } -struct recall_threadargs { - struct inode *inode; - struct nfs_client *clp; - const nfs4_stateid *stateid; - - struct completion started; - int result; -}; - -static int recall_thread(void *data) +void nfs_expire_unreferenced_delegations(struct nfs_client *clp) { - struct recall_threadargs *args = (struct recall_threadargs *)data; - struct inode *inode = igrab(args->inode); - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_delegation *delegation; - - daemonize("nfsv4-delegreturn"); - - nfs_msync_inode(inode); - down_read(&clp->cl_sem); - down_write(&nfsi->rwsem); - spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, args->stateid); - if (delegation != NULL) - args->result = 0; - else - args->result = -ENOENT; - spin_unlock(&clp->cl_lock); - complete(&args->started); - nfs_delegation_claim_opens(inode, args->stateid); - up_write(&nfsi->rwsem); - up_read(&clp->cl_sem); - nfs_msync_inode(inode); - - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 1); - iput(inode); - module_put_and_exit(0); + nfs_client_mark_return_unreferenced_delegations(clp); + nfs_delegation_run_state_manager(clp); } /* @@ -432,22 +429,20 @@ static int recall_thread(void *data) */ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { - struct recall_threadargs data = { - .inode = inode, - .stateid = stateid, - }; - int status; + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; - init_completion(&data.started); - __module_get(THIS_MODULE); - status = kernel_thread(recall_thread, &data, CLONE_KERNEL); - if (status < 0) - goto out_module_put; - wait_for_completion(&data.started); - return data.result; -out_module_put: - module_put(THIS_MODULE); - return status; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, + sizeof(delegation->stateid.data)) != 0) { + rcu_read_unlock(); + return -ENOENT; + } + nfs_mark_return_delegation(clp, delegation); + rcu_read_unlock(); + nfs_delegation_run_state_manager(clp); + return 0; } /* @@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs struct inode *res = NULL; rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL && + nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { res = igrab(delegation->inode); - break; } + spin_unlock(&delegation->lock); + if (res != NULL) + break; } rcu_read_unlock(); return res; @@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) struct nfs_delegation *delegation; rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) - delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; + set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); rcu_read_unlock(); } @@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation; + struct inode *inode; restart: rcu_read_lock(); list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { - if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) continue; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); spin_unlock(&clp->cl_lock); rcu_read_unlock(); if (delegation != NULL) nfs_free_delegation(delegation); + iput(inode); goto restart; } rcu_read_unlock(); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index f1c5e2a5d88e..09f383795174 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -17,14 +17,20 @@ struct nfs_delegation { struct rpc_cred *cred; struct inode *inode; nfs4_stateid stateid; - int type; -#define NFS_DELEGATION_NEED_RECLAIM 1 - long flags; + fmode_t type; loff_t maxsize; __u64 change_attr; + unsigned long flags; + spinlock_t lock; struct rcu_head rcu; }; +enum { + NFS_DELEGATION_NEED_RECLAIM = 0, + NFS_DELEGATION_RETURN, + NFS_DELEGATION_REFERENCED, +}; + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); @@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); -void nfs_return_all_delegations(struct super_block *sb); +void nfs_super_return_all_delegations(struct super_block *sb); void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_expire_unreferenced_delegations(struct nfs_client *clp); void nfs_handle_cb_pathdown(struct nfs_client *clp); +void nfs_client_return_marked_delegations(struct nfs_client *clp); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); -static inline int nfs_have_delegation(struct inode *inode, int flags) -{ - struct nfs_delegation *delegation; - int ret = 0; - - flags &= FMODE_READ|FMODE_WRITE; - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) - ret = 1; - rcu_read_unlock(); - return ret; -} +void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); +int nfs_have_delegation(struct inode *inode, fmode_t flags); #else -static inline int nfs_have_delegation(struct inode *inode, int flags) +static inline int nfs_have_delegation(struct inode *inode, fmode_t flags) { return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3e64b98f3a93..e35c8199f82f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) goto out_bad; } + if (nfs_have_delegation(inode, FMODE_READ)) + goto out_set_verifier; + /* Force a full look up iff the parent directory has changed */ if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { if (nfs_lookup_verify_inode(inode, nd)) @@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; +out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: dput(parent); @@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = { * Use intent information to determine whether we need to substitute * the NFSv4-style stateful OPEN for the LOOKUP call */ -static int is_atomic_open(struct inode *dir, struct nameidata *nd) +static int is_atomic_open(struct nameidata *nd) { if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) return 0; @@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); /* Check that we are indeed trying to open this file */ - if (!is_atomic_open(dir, nd)) + if (!is_atomic_open(nd)) goto no_open; if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { @@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *dir; int openflags, ret = 0; + if (!is_atomic_open(nd)) + goto no_open; parent = dget_parent(dentry); dir = parent->d_inode; - if (!is_atomic_open(dir, nd)) - goto no_open; /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ @@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; + goto no_open_dput; openflags = nd->intent.open.flags; /* We cannot do exclusive creation on a positive dentry */ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) - goto no_open; + goto no_open_dput; /* We can't create new files, or truncate existing ones here */ openflags &= ~(O_CREAT|O_TRUNC); @@ -1081,10 +1085,9 @@ out: if (!ret) d_drop(dentry); return ret; -no_open: +no_open_dput: dput(parent); - if (inode != NULL && nfs_have_delegation(inode, FMODE_READ)) - return 1; +no_open: return nfs_lookup_revalidate(dentry, nd); } #endif /* CONFIG_NFSV4 */ @@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str cache = nfs_access_search_rbtree(inode, cred); if (cache == NULL) goto out; - if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + if (!nfs_have_delegation(inode, FMODE_READ) && + !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) goto out_stale; res->jiffies = cache->jiffies; res->cred = cache->cred; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d319b49f8f06..90f292b520d2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d22eb383e1cf..0c381686171e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context /* * Given an inode, search for an open context with the desired characteristics */ -struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode) +struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; @@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode) if (nfs_have_delegation(inode, FMODE_READ)) return 0; - /* - * Special case: if the attribute timeout is set to 0, then always - * treat the cache as having expired (unless holding - * a delegation). - */ - if (nfsi->attrtimeo == 0) - return 1; - return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } /** @@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { - if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d212ee41caf2..340ede8f608f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -63,6 +63,20 @@ struct nfs_parsed_mount_data { struct security_mnt_opts lsm_opts; }; +/* mount_clnt.c */ +struct nfs_mount_request { + struct sockaddr *sap; + size_t salen; + char *hostname; + char *dirpath; + u32 version; + unsigned short protocol; + struct nfs_fh *fh; + int noresvport; +}; + +extern int nfs_mount(struct nfs_mount_request *info); + /* client.c */ extern struct rpc_program nfs_program; diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 086a6830d785..ca905a5bb1ba 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -29,47 +29,43 @@ struct mnt_fhstatus { /** * nfs_mount - Obtain an NFS file handle for the given host and path - * @addr: pointer to server's address - * @len: size of server's address - * @hostname: name of server host, or NULL - * @path: pointer to string containing export path to mount - * @version: mount version to use for this request - * @protocol: transport protocol to use for thie request - * @fh: pointer to location to place returned file handle + * @info: pointer to mount request arguments * * Uses default timeout parameters specified by underlying transport. */ -int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, - int version, int protocol, struct nfs_fh *fh) +int nfs_mount(struct nfs_mount_request *info) { struct mnt_fhstatus result = { - .fh = fh + .fh = info->fh }; struct rpc_message msg = { - .rpc_argp = path, + .rpc_argp = info->dirpath, .rpc_resp = &result, }; struct rpc_create_args args = { - .protocol = protocol, - .address = addr, - .addrsize = len, - .servername = hostname, + .protocol = info->protocol, + .address = info->sap, + .addrsize = info->salen, + .servername = info->hostname, .program = &mnt_program, - .version = version, + .version = info->version, .authflavor = RPC_AUTH_UNIX, - .flags = 0, }; struct rpc_clnt *mnt_clnt; int status; dprintk("NFS: sending MNT request for %s:%s\n", - (hostname ? hostname : "server"), path); + (info->hostname ? info->hostname : "server"), + info->dirpath); + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) goto out_clnt_err; - if (version == NFS_MNT3_VERSION) + if (info->version == NFS_MNT3_VERSION) msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; else msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ea790645fda6..4e4d33204376 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -38,8 +38,12 @@ struct idmap; ((err) != NFSERR_NOFILEHANDLE)) enum nfs4_client_state { - NFS4CLNT_STATE_RECOVER = 0, + NFS4CLNT_MANAGER_RUNNING = 0, + NFS4CLNT_CHECK_LEASE, NFS4CLNT_LEASE_EXPIRED, + NFS4CLNT_RECLAIM_REBOOT, + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, }; /* @@ -90,12 +94,18 @@ struct nfs4_state_owner { spinlock_t so_lock; atomic_t so_count; + unsigned long so_flags; struct list_head so_states; struct list_head so_delegations; struct nfs_seqid_counter so_seqid; struct rpc_sequence so_sequence; }; +enum { + NFS_OWNER_RECLAIM_REBOOT, + NFS_OWNER_RECLAIM_NOGRACE +}; + /* * struct nfs4_state maintains the client-side state for a given * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). @@ -128,6 +138,8 @@ enum { NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ + NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ + NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ }; struct nfs4_state { @@ -149,7 +161,7 @@ struct nfs4_state { unsigned int n_rdonly; /* Number of read-only references */ unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ - int state; /* State on the server (R,W, or RW) */ + fmode_t state; /* State on the server (R,W, or RW) */ atomic_t count; }; @@ -157,9 +169,12 @@ struct nfs4_state { struct nfs4_exception { long timeout; int retry; + struct nfs4_state *state; }; struct nfs4_state_recovery_ops { + int owner_flag_bit; + int state_flag_bit; int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); int (*recover_lock)(struct nfs4_state *, struct file_lock *); }; @@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ -extern int nfs4_map_errors(int err); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); @@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; -extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; +extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; @@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); /* nfs4state.c */ -struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); -extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); -extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t); -extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); +extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_schedule_state_manager(struct nfs_client *); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 83e700a2b0c0..8dde84b988d9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -62,14 +62,12 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); /* Prevent leaks of NFSv4 errors into userland */ -int nfs4_map_errors(int err) +static int nfs4_map_errors(int err) { if (err < -1000) { dprintk("%s could not handle NFSv4 error %d\n", @@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start, KM_USER0); } +static int nfs4_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + +static int nfs4_wait_clnt_recover(struct nfs_client *clp) +{ + int res; + + might_sleep(); + + res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs4_wait_bit_killable, TASK_KILLABLE); + return res; +} + +static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +{ + int res = 0; + + might_sleep(); + + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + schedule_timeout_killable(*timeout); + if (fatal_signal_pending(current)) + res = -ERESTARTSYS; + *timeout <<= 1; + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. + */ +static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = exception->state; + int ret = errorcode; + + exception->retry = 0; + switch(errorcode) { + case 0: + return 0; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + nfs4_state_mark_reclaim_nograce(clp, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_schedule_state_recovery(clp); + ret = nfs4_wait_clnt_recover(clp); + if (ret == 0) + exception->retry = 1; + break; + case -NFS4ERR_FILE_OPEN: + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + ret = nfs4_delay(server->client, &exception->timeout); + if (ret != 0) + break; + case -NFS4ERR_OLD_STATEID: + exception->retry = 1; + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); +} + + static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { struct nfs_client *clp = server->nfs_client; @@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) } static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, - struct nfs4_state_owner *sp, int flags, + struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs) { struct dentry *parent = dget_parent(path->dentry); @@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, p->owner = sp; atomic_inc(&sp->so_count); p->o_arg.fh = NFS_FH(dir); - p->o_arg.open_flags = flags, + p->o_arg.open_flags = flags; + p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id = sp->so_owner_id.id; p->o_arg.name = &p->path.dentry->d_name; @@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } -static int can_open_cached(struct nfs4_state *state, int mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) { int ret = 0; - switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) { + + if (open_mode & O_EXCL) + goto out; + switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; break; @@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode) case FMODE_READ|FMODE_WRITE: ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; } +out: return ret; } -static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags) +static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) { - if ((delegation->type & open_flags) != open_flags) + if ((delegation->type & fmode) != fmode) return 0; - if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) + if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + nfs_mark_delegation_referenced(delegation); return 1; } -static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) +static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) { - switch (open_flags) { + switch (fmode) { case FMODE_WRITE: state->n_wronly++; break; @@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) case FMODE_READ|FMODE_WRITE: state->n_rdwr++; } - nfs4_state_set_mode_locked(state, state->state | open_flags); + nfs4_state_set_mode_locked(state, state->state | fmode); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) { if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); - switch (open_flags) { + switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); break; @@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * } } -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) +static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) { write_seqlock(&state->seqlock); - nfs_set_open_stateid_locked(state, stateid, open_flags); + nfs_set_open_stateid_locked(state, stateid, fmode); write_sequnlock(&state->seqlock); } -static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) +static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) { - open_flags &= (FMODE_READ|FMODE_WRITE); /* * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update @@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, open_flags); + nfs_set_open_stateid_locked(state, open_stateid, fmode); write_sequnlock(&state->seqlock); spin_lock(&state->owner->so_lock); - update_open_stateflags(state, open_flags); + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags) +static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +{ + struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs_delegation *deleg_cur; + int ret = 0; + + fmode &= (FMODE_READ|FMODE_WRITE); + + rcu_read_lock(); + deleg_cur = rcu_dereference(nfsi->delegation); + if (deleg_cur == NULL) + goto no_delegation; + + spin_lock(&deleg_cur->lock); + if (nfsi->delegation != deleg_cur || + (deleg_cur->type & fmode) != fmode) + goto no_delegation_unlock; + + if (delegation == NULL) + delegation = &deleg_cur->stateid; + else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + ret = 1; +no_delegation_unlock: + spin_unlock(&deleg_cur->lock); +no_delegation: + rcu_read_unlock(); + + if (!ret && open_stateid != NULL) { + __update_open_stateid(state, open_stateid, NULL, fmode); + ret = 1; + } + + return ret; +} + + +static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) { struct nfs_delegation *delegation; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation == NULL || (delegation->type & open_flags) == open_flags) { + if (delegation == NULL || (delegation->type & fmode) == fmode) { rcu_read_unlock(); return; } @@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL); + int open_mode = opendata->o_arg.open_flags & O_EXCL; + fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; - rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); for (;;) { - if (can_open_cached(state, open_mode)) { + if (can_open_cached(state, fmode, open_mode)) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, open_mode)) { - update_open_stateflags(state, open_mode); + if (can_open_cached(state, fmode, open_mode)) { + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); - rcu_read_unlock(); goto out_return_state; } spin_unlock(&state->owner->so_lock); } - if (delegation == NULL) - break; - if (!can_open_delegated(delegation, open_mode)) + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation == NULL || + !can_open_delegated(delegation, fmode)) { + rcu_read_unlock(); break; + } /* Save the delegation */ memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); rcu_read_unlock(); @@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) if (ret != 0) goto out; ret = -EAGAIN; - rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); - /* If no delegation, try a cached open */ - if (delegation == NULL) - continue; - /* Is the delegation still valid? */ - if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0) - continue; - rcu_read_unlock(); - update_open_stateid(state, NULL, &stateid, open_mode); - goto out_return_state; + + /* Try to update the stateid using the delegation */ + if (update_open_stateid(state, NULL, &stateid, fmode)) + goto out_return_state; } - rcu_read_unlock(); out: return ERR_PTR(ret); out_return_state: @@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data struct inode *inode; struct nfs4_state *state = NULL; struct nfs_delegation *delegation; - nfs4_stateid *deleg_stateid = NULL; int ret; if (!data->rpc_done) { @@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (delegation) delegation_flags = delegation->flags; rcu_read_unlock(); - if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) + if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, data->owner->so_cred, &data->o_res); @@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data data->owner->so_cred, &data->o_res); } - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) - deleg_stateid = &delegation->stateid; - update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags); - rcu_read_unlock(); + + update_open_stateid(state, &data->o_res.stateid, NULL, + data->o_arg.fmode); iput(inode); out: return state; @@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); + opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context return opendata; } -static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res) +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) { struct nfs4_state *newstate; int ret; - opendata->o_arg.open_flags = openflags; + opendata->o_arg.open_flags = 0; + opendata->o_arg.fmode = fmode; memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf newstate = nfs4_opendata_to_nfs4_state(opendata); if (IS_ERR(newstate)) return PTR_ERR(newstate); - nfs4_close_state(&opendata->path, newstate, openflags); + nfs4_close_state(&opendata->path, newstate, fmode); *res = newstate; return 0; } @@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state { struct nfs_delegation *delegation; struct nfs4_opendata *opendata; - int delegation_type = 0; + fmode_t delegation_type = 0; int status; opendata = nfs4_open_recoverdata_alloc(ctx, state); @@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state opendata->o_arg.fh = NFS_FH(state->inode); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0) + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) delegation_type = delegation->type; rcu_read_unlock(); opendata->o_arg.u.delegation_type = delegation_type; @@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata) goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.open_flags); + nfs4_close_state(&data->path, state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } @@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) + if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); if (delegation != NULL && - (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { rcu_read_unlock(); goto out_no_action; } @@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata) goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.open_flags); + nfs4_close_state(&data->path, state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } @@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) int ret; for (;;) { - ret = nfs4_wait_clnt_recover(server->client, clp); + ret = nfs4_wait_clnt_recover(clp); if (ret != 0) return ret; - if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && + !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; nfs4_schedule_state_recovery(clp); } @@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4 do { err = _nfs4_open_expired(ctx, state); - if (err == -NFS4ERR_DELAY) - nfs4_handle_exception(server, err, &exception); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct /* * Returns a referenced nfs4_state */ -static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); - struct nfs_client *clp = server->nfs_client; struct nfs4_opendata *opendata; int status; @@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct if (status != 0) goto err_put_state_owner; if (path->dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE)); - down_read(&clp->cl_sem); + nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(path, sp, flags, sattr); + opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr); if (opendata == NULL) - goto err_release_rwsem; + goto err_put_state_owner; if (path->dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); @@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct goto err_opendata_put; nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); - up_read(&clp->cl_sem); *res = state; return 0; err_opendata_put: nfs4_opendata_put(opendata); -err_release_rwsem: - up_read(&clp->cl_sem); err_put_state_owner: nfs4_put_state_owner(sp); out_err: @@ -1088,14 +1195,14 @@ out_err: } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) { struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, path, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data) renew_lease(server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: - break; + if (calldata->arg.fmode == 0) + break; default: - if (nfs4_async_handle_error(task, server) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { rpc_restart_call(task); return; } @@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) nfs_fattr_init(calldata->res.fattr); if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.open_flags = FMODE_READ; + calldata->arg.fmode = FMODE_READ; } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.open_flags = FMODE_WRITE; + calldata->arg.fmode = FMODE_WRITE; } calldata->timestamp = jiffies; rpc_call_start(task); @@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); if (calldata->arg.seqid == NULL) goto out_free_calldata; + calldata->arg.fmode = 0; calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; @@ -1354,13 +1465,13 @@ out: return status; } -static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode) { struct file *filp; int ret; /* If the open_intent is for execute, we have an extra check to make */ - if (nd->intent.open.flags & FMODE_EXEC) { + if (fmode & FMODE_EXEC) { ret = nfs_may_open(state->inode, state->owner->so_cred, nd->intent.open.flags); @@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct } ret = PTR_ERR(filp); out_close: - nfs4_close_sync(path, state, nd->intent.open.flags); + nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE)); return ret; } @@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct rpc_cred *cred; struct nfs4_state *state; struct dentry *res; + fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) { @@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) path.dentry = res; nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); nfs_unblock_sillyrename(parent); - nfs4_intent_set_file(nd, &path, state); + nfs4_intent_set_file(nd, &path, state, fmode); return res; } @@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st }; struct rpc_cred *cred; struct nfs4_state *state; + fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE); cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - state = nfs4_do_open(dir, &path, openflags, NULL, cred); + state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred); put_rpccred(cred); if (IS_ERR(state)) { switch (PTR_ERR(state)) { @@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st } if (state->inode == dentry->d_inode) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - nfs4_intent_set_file(nd, &path, state); + nfs4_intent_set_file(nd, &path, state, fmode); return 1; } - nfs4_close_sync(&path, state, openflags); + nfs4_close_sync(&path, state, fmode); out_drop: d_drop(dentry); return 0; @@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, }; struct nfs4_state *state; struct rpc_cred *cred; + fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE); int status = 0; cred = rpc_lookup_cred(); @@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = PTR_ERR(cred); goto out; } - state = nfs4_do_open(dir, &path, flags, sattr, cred); + state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(state->inode, &fattr); } if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) - status = nfs4_intent_set_file(nd, &path, state); + status = nfs4_intent_set_file(nd, &path, state, fmode); else - nfs4_close_sync(&path, state, flags); + nfs4_close_sync(&path, state, fmode); out_putcred: put_rpccred(cred); out: @@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) { struct nfs_removeres *res = task->tk_msg.rpc_resp; - if (nfs4_async_handle_error(task, res->server) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); nfs_post_op_update_inode(dir, &res->dir_attr); @@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); - if (nfs4_async_handle_error(task, server) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { rpc_restart_call(task); return -EAGAIN; } @@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { rpc_restart_call(task); return -EAGAIN; } @@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { rpc_restart_call(task); return -EAGAIN; } @@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen } static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { struct nfs_client *clp = server->nfs_client; if (!clp || task->tk_status >= 0) return 0; switch(task->tk_status) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + nfs4_state_mark_reclaim_nograce(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; return -EAGAIN; @@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) return 0; } -static int nfs4_wait_bit_killable(void *word) -{ - if (fatal_signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp) -{ - int res; - - might_sleep(); - - rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_); - - res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER, - nfs4_wait_bit_killable, TASK_KILLABLE); - - rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_); - return res; -} - -static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) -{ - int res = 0; - - might_sleep(); - - if (*timeout <= 0) - *timeout = NFS4_POLL_RETRY_MIN; - if (*timeout > NFS4_POLL_RETRY_MAX) - *timeout = NFS4_POLL_RETRY_MAX; - schedule_timeout_killable(*timeout); - if (fatal_signal_pending(current)) - res = -ERESTARTSYS; - *timeout <<= 1; - return res; -} - -/* This is the error handling routine for processes that are allowed - * to sleep. - */ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) -{ - struct nfs_client *clp = server->nfs_client; - int ret = errorcode; - - exception->retry = 0; - switch(errorcode) { - case 0: - return 0; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(clp); - ret = nfs4_wait_clnt_recover(server->client, clp); - if (ret == 0) - exception->retry = 1; - break; - case -NFS4ERR_FILE_OPEN: - case -NFS4ERR_GRACE: - case -NFS4ERR_DELAY: - ret = nfs4_delay(server->client, &exception->timeout); - if (ret != 0) - break; - case -NFS4ERR_OLD_STATEID: - exception->retry = 1; - } - /* We failed to handle the error */ - return nfs4_map_errors(ret); -} - int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) { nfs4_verifier sc_verifier; @@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre spin_lock(&clp->cl_lock); clp->cl_lease_time = fsinfo.lease_time * HZ; clp->cl_last_renewal = now; - clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); spin_unlock(&clp->cl_lock); } return status; @@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs4_lock_state *lsp; int status; - down_read(&clp->cl_sem); arg.lock_owner.clientid = clp->cl_clientid; status = nfs4_set_lock_state(state, request); if (status != 0) @@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock } request->fl_ops->fl_release_private(request); out: - up_read(&clp->cl_sem); return status; } @@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) sizeof(calldata->lsp->ls_stateid.data)); renew_lease(calldata->server, calldata->timestamp); break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) + if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) rpc_restart_call(task); } } @@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; - if (do_vfs_lock(request->fl_file, request) == -ENOENT) + down_read(&nfsi->rwsem); + if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + up_read(&nfsi->rwsem); goto out; + } + up_read(&nfsi->rwsem); if (status != 0) goto out; /* Is this a delegated lock? */ @@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_client *clp = state->owner->so_client; + struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; int status; @@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); if (status < 0) goto out; - down_read(&clp->cl_sem); + down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - struct nfs_inode *nfsi = NFS_I(state->inode); /* Yes: cache locks! */ - down_read(&nfsi->rwsem); /* ...but avoid races with delegation recall... */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { - request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); - up_read(&nfsi->rwsem); - goto out_unlock; - } - up_read(&nfsi->rwsem); + request->fl_flags = fl_flags & ~FL_SLEEP; + status = do_vfs_lock(request->fl_file, request); + goto out_unlock; } status = _nfs4_do_setlk(state, cmd, request, 0); if (status != 0) @@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (do_vfs_lock(request->fl_file, request) < 0) printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); out_unlock: - up_read(&clp->cl_sem); + up_read(&nfsi->rwsem); out: request->fl_flags = fl_flags; return status; @@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, } struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT, + .state_flag_bit = NFS_STATE_RECLAIM_REBOOT, .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, }; -struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { +struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = { + .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, + .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, .recover_open = nfs4_open_expired, .recover_lock = nfs4_lock_expired, }; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 3305acbbe2ae..f524e932ff7b 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work) long lease, timeout; unsigned long last, now; - down_read(&clp->cl_sem); dprintk("%s: start\n", __func__); /* Are there any active superblocks? */ if (list_empty(&clp->cl_superblocks)) @@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work) timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? */ if (time_after(now, last + lease/3)) { - cred = nfs4_get_renew_cred(clp); + cred = nfs4_get_renew_cred_locked(clp); + spin_unlock(&clp->cl_lock); if (cred == NULL) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - spin_unlock(&clp->cl_lock); + if (list_empty(&clp->cl_delegations)) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + goto out; + } nfs_expire_all_delegations(clp); - goto out; + } else { + /* Queue an asynchronous RENEW. */ + nfs4_proc_async_renew(clp, cred); + put_rpccred(cred); } - spin_unlock(&clp->cl_lock); - /* Queue an asynchronous RENEW. */ - nfs4_proc_async_renew(clp, cred); - put_rpccred(cred); timeout = (2 * lease) / 3; spin_lock(&clp->cl_lock); } else @@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work) cancel_delayed_work(&clp->cl_renewd); schedule_delayed_work(&clp->cl_renewd, timeout); spin_unlock(&clp->cl_lock); + nfs_expire_unreferenced_delegations(clp); out: - up_read(&clp->cl_sem); dprintk("%s: done\n", __func__); } -/* Must be called with clp->cl_sem locked for writes */ void nfs4_schedule_state_renewal(struct nfs_client *clp) { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 401ef8b28f97..2022fe47966f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) return status; } -static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp) +static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) { struct rpc_cred *cred = NULL; - spin_lock(&clp->cl_lock); if (clp->cl_machine_cred != NULL) cred = get_rpccred(clp->cl_machine_cred); - spin_unlock(&clp->cl_lock); return cred; } @@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp) put_rpccred(cred); } -struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) +struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; @@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) return cred; } +static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = nfs4_get_renew_cred_locked(clp); + spin_unlock(&clp->cl_lock); + return cred; +} + static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; struct rpc_cred *cred; - cred = nfs4_get_machine_cred(clp); + spin_lock(&clp->cl_lock); + cred = nfs4_get_machine_cred_locked(clp); if (cred != NULL) goto out; pos = rb_first(&clp->cl_state_owners); @@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) cred = get_rpccred(sp->so_cred); } out: + spin_unlock(&clp->cl_lock); return cred; } @@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp) } } -/* - * Note: must be called with clp->cl_sem held in order to prevent races - * with reboot recovery! - */ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) { struct nfs_client *clp = server->nfs_client; @@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct return sp; } -/* - * Must be called with clp->cl_sem held in order to avoid races - * with state recovery... - */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { struct nfs_client *clp = sp->so_client; @@ -361,18 +363,18 @@ nfs4_alloc_open_state(void) } void -nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode) +nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode) { - if (state->state == mode) + if (state->state == fmode) return; /* NB! List reordering - see the reclaim code for why. */ - if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { - if (mode & FMODE_WRITE) + if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { + if (fmode & FMODE_WRITE) list_move(&state->open_states, &state->owner->so_states); else list_move_tail(&state->open_states, &state->owner->so_states); } - state->state = mode; + state->state = fmode; } static struct nfs4_state * @@ -432,10 +434,6 @@ out: return state; } -/* - * Beware! Caller must be holding exactly one - * reference to clp->cl_sem! - */ void nfs4_put_open_state(struct nfs4_state *state) { struct inode *inode = state->inode; @@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state) /* * Close the current file. */ -static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait) +static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait) { struct nfs4_state_owner *owner = state->owner; int call_close = 0; - int newstate; + fmode_t newstate; atomic_inc(&owner->so_count); /* Protect against nfs4_find_state() */ spin_lock(&owner->so_lock); - switch (mode & (FMODE_READ | FMODE_WRITE)) { + switch (fmode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: state->n_rdonly--; break; @@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod nfs4_do_close(path, state, wait); } -void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) +void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, mode, 0); + __nfs4_close(path, state, fmode, 0); } -void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode) +void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, mode, 1); + __nfs4_close(path, state, fmode, 1); } /* @@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. * - * The caller must be holding clp->cl_sem */ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) { @@ -770,32 +767,34 @@ unlock: return status; } -static int reclaimer(void *); +static int nfs4_run_state_manager(void *); -static inline void nfs4_clear_recover_bit(struct nfs_client *clp) +static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { smp_mb__before_clear_bit(); - clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); smp_mb__after_clear_bit(); - wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); + wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } /* - * State recovery routine + * Schedule the nfs_client asynchronous state management routine */ -static void nfs4_recover_state(struct nfs_client *clp) +void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; __module_get(THIS_MODULE); atomic_inc(&clp->cl_count); - task = kthread_run(reclaimer, clp, "%s-reclaim", + task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); if (!IS_ERR(task)) return; - nfs4_clear_recover_bit(clp); + nfs4_clear_state_manager_bit(clp); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) { if (!clp) return; - if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - nfs4_recover_state(clp); + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + nfs4_schedule_state_manager(clp); } -static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +{ + + set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + /* Don't recover state that expired before the reboot */ + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + return 0; + } + set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + return 1; +} + +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +{ + set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); + set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + return 1; +} + +static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; int status = 0; + down_write(&nfsi->rwsem); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) continue; @@ -839,12 +864,14 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s goto out_err; } } + up_write(&nfsi->rwsem); return 0; out_err: + up_write(&nfsi->rwsem); return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct nfs4_state_owner *sp) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; struct nfs4_lock_state *lock; @@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n * recovering after a network partition or a reboot from a * server that doesn't support a grace period. */ +restart: + spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) + continue; if (state->state == 0) continue; + atomic_inc(&state->count); + spin_unlock(&sp->so_lock); status = ops->recover_open(sp, state); if (status >= 0) { - status = nfs4_reclaim_locks(ops, state); - if (status < 0) - goto out_err; - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) - printk("%s: Lock reclaim failed!\n", + status = nfs4_reclaim_locks(state, ops); + if (status >= 0) { + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) + printk("%s: Lock reclaim failed!\n", __func__); + } + nfs4_put_open_state(state); + goto restart; } - continue; } switch (status) { default: printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", __func__, status); case -ENOENT: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: + case -ESTALE: /* * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. @@ -889,84 +922,176 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n /* Mark the file as being 'closed' */ state->state = 0; break; + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_client, state); + break; case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_client, state); case -NFS4ERR_STALE_CLIENTID: goto out_err; } + nfs4_put_open_state(state); + goto restart; } + spin_unlock(&sp->so_lock); return 0; out_err: + nfs4_put_open_state(state); return status; } -static void nfs4_state_mark_reclaim(struct nfs_client *clp) +static void nfs4_clear_open_state(struct nfs4_state *state) +{ + struct nfs4_lock_state *lock; + + clear_bit(NFS_DELEGATED_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDWR_STATE, &state->flags); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + lock->ls_seqid.flags = 0; + lock->ls_flags &= ~NFS_LOCK_INITIALIZED; + } +} + +static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state)) { struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; - struct nfs4_lock_state *lock; /* Reset all sequence ids to zero */ for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - sp->so_seqid.counter = 0; sp->so_seqid.flags = 0; spin_lock(&sp->so_lock); list_for_each_entry(state, &sp->so_states, open_states) { - clear_bit(NFS_DELEGATED_STATE, &state->flags); - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - lock->ls_seqid.counter = 0; - lock->ls_seqid.flags = 0; - lock->ls_flags &= ~NFS_LOCK_INITIALIZED; - } + if (mark_reclaim(clp, state)) + nfs4_clear_open_state(state); } spin_unlock(&sp->so_lock); } } -static int reclaimer(void *ptr) +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) +{ + /* Mark all delegations for reclaim */ + nfs_delegation_mark_reclaim(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { - struct nfs_client *clp = ptr; struct nfs4_state_owner *sp; struct rb_node *pos; - struct nfs4_state_recovery_ops *ops; - struct rpc_cred *cred; + struct nfs4_state *state; + + if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + return; + + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + spin_lock(&sp->so_lock); + list_for_each_entry(state, &sp->so_states, open_states) { + if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags)) + continue; + nfs4_state_mark_reclaim_nograce(clp, state); + } + spin_unlock(&sp->so_lock); + } + + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs_delegation_clear_all(struct nfs_client *clp) +{ + nfs_delegation_mark_reclaim(clp); + nfs_delegation_reap_unclaimed(clp); +} + +static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) +{ + nfs_delegation_clear_all(clp); + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); +} + +static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp) +{ + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); +} + +static void nfs4_recovery_handle_error(struct nfs_client *clp, int error) +{ + switch (error) { + case -NFS4ERR_CB_PATH_DOWN: + nfs_handle_cb_pathdown(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_LEASE_MOVED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + } +} + +static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) +{ + struct rb_node *pos; int status = 0; - allow_signal(SIGKILL); +restart: + spin_lock(&clp->cl_lock); + for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { + struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); + if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) + continue; + atomic_inc(&sp->so_count); + spin_unlock(&clp->cl_lock); + status = nfs4_reclaim_open_state(sp, ops); + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); + nfs4_recovery_handle_error(clp, status); + return status; + } + nfs4_put_state_owner(sp); + goto restart; + } + spin_unlock(&clp->cl_lock); + return status; +} - /* Ensure exclusive access to NFSv4 state */ - down_write(&clp->cl_sem); - /* Are there any NFS mounts out there? */ - if (list_empty(&clp->cl_superblocks)) - goto out; -restart_loop: - ops = &nfs4_network_partition_recovery_ops; - /* Are there any open files on this volume? */ +static int nfs4_check_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int status = -NFS4ERR_EXPIRED; + + /* Is the client already known to have an expired lease? */ + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + return 0; cred = nfs4_get_renew_cred(clp); - if (cred != NULL) { - /* Yes there are: try to renew the old lease */ - status = nfs4_proc_renew(clp, cred); - put_rpccred(cred); - switch (status) { - case 0: - case -NFS4ERR_CB_PATH_DOWN: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: - ops = &nfs4_reboot_recovery_ops; - } - } else { - /* "reboot" to ensure we clear all state on the server */ - clp->cl_boot_time = CURRENT_TIME; + if (cred == NULL) { + cred = nfs4_get_setclientid_cred(clp); + if (cred == NULL) + goto out; } - /* We're going to have to re-establish a clientid */ - nfs4_state_mark_reclaim(clp); - status = -ENOENT; + status = nfs4_proc_renew(clp, cred); + put_rpccred(cred); +out: + nfs4_recovery_handle_error(clp, status); + return status; +} + +static int nfs4_reclaim_lease(struct nfs_client *clp) +{ + struct rpc_cred *cred; + int status = -ENOENT; + cred = nfs4_get_setclientid_cred(clp); if (cred != NULL) { status = nfs4_init_client(clp, cred); @@ -974,42 +1099,90 @@ restart_loop: /* Handle case where the user hasn't set up machine creds */ if (status == -EACCES && cred == clp->cl_machine_cred) { nfs4_clear_machine_cred(clp); - goto restart_loop; + status = -EAGAIN; } } - if (status) - goto out_error; - /* Mark all delegations for reclaim */ - nfs_delegation_mark_reclaim(clp); - /* Note: list is protected by exclusive lock on cl->cl_sem */ - for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { - sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - status = nfs4_reclaim_open_state(ops, sp); - if (status < 0) { - if (status == -NFS4ERR_NO_GRACE) { - ops = &nfs4_network_partition_recovery_ops; - status = nfs4_reclaim_open_state(ops, sp); + return status; +} + +static void nfs4_state_manager(struct nfs_client *clp) +{ + int status = 0; + + /* Ensure exclusive access to NFSv4 state */ + for(;;) { + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { + /* We're going to have to re-establish a clientid */ + status = nfs4_reclaim_lease(clp); + if (status) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + if (status == -EAGAIN) + continue; + goto out_error; } + clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); + } + + if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { + status = nfs4_check_lease(clp); + if (status != 0) + continue; + } + + /* First recover reboot state... */ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops); if (status == -NFS4ERR_STALE_CLIENTID) - goto restart_loop; - if (status == -NFS4ERR_EXPIRED) - goto restart_loop; + continue; + nfs4_state_end_reclaim_reboot(clp); + continue; + } + + /* Now recover expired state... */ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops); + if (status < 0) { + set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); + if (status == -NFS4ERR_STALE_CLIENTID) + continue; + if (status == -NFS4ERR_EXPIRED) + continue; + goto out_error; + } else + nfs4_state_end_reclaim_nograce(clp); + continue; } + + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + continue; + } + + nfs4_clear_state_manager_bit(clp); + /* Did we race with an attempt to give us more work? */ + if (clp->cl_state == 0) + break; + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + break; } - nfs_delegation_reap_unclaimed(clp); -out: - up_write(&clp->cl_sem); - if (status == -NFS4ERR_CB_PATH_DOWN) - nfs_handle_cb_pathdown(clp); - nfs4_clear_recover_bit(clp); + return; +out_error: + printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" + " with error %d\n", clp->cl_hostname, -status); + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + nfs4_state_end_reclaim_reboot(clp); + nfs4_clear_state_manager_bit(clp); +} + +static int nfs4_run_state_manager(void *ptr) +{ + struct nfs_client *clp = ptr; + + allow_signal(SIGKILL); + nfs4_state_manager(clp); nfs_put_client(clp); module_put_and_exit(0); return 0; -out_error: - printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s" - " with error %d\n", clp->cl_hostname, -status); - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - goto out; } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b916297d2334..d1e4c8f8a0a9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -8,7 +8,7 @@ * * Kendrick Smith <kmsmith@umich.edu> * Andy Adamson <andros@umich.edu> - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int); #define NFS4_MAXTAGLEN 0 #endif -/* lock,open owner id: +/* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ #define open_owner_id_maxsz (1 + 4) @@ -541,6 +541,7 @@ static struct { struct compound_hdr { int32_t status; uint32_t nops; + __be32 * nops_p; uint32_t taglen; char * tag; }; @@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char * xdr_encode_opaque(p, str, len); } -static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) +static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; @@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) WRITE32(hdr->taglen); WRITEMEM(hdr->tag, hdr->taglen); WRITE32(NFS4_MINOR_VERSION); + hdr->nops_p = p; WRITE32(hdr->nops); - return 0; +} + +static void encode_nops(struct compound_hdr *hdr) +{ + *hdr->nops_p = htonl(hdr->nops); } static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) @@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); } -static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) +static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s int len; uint32_t bmval0 = 0; uint32_t bmval1 = 0; - int status; /* * We reserve enough space to write the entire attribute buffer at once. @@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; WRITE32(NFS4_SET_TO_SERVER_TIME); } - + /* * Now we backfill the bitmap and the attribute buffer length. */ @@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s *q++ = htonl(bmval1); *q++ = htonl(len); - status = 0; /* out: */ - return status; } -static int encode_access(struct xdr_stream *xdr, u32 access) +static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(8); WRITE32(OP_ACCESS); WRITE32(access); - - return 0; + hdr->nops++; } -static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) +static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) WRITE32(OP_CLOSE); WRITE32(arg->seqid->sequence->counter); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - - return 0; + hdr->nops++; } -static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) +static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) { __be32 *p; - - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); - return 0; + RESERVE_SPACE(16); + WRITE32(OP_COMMIT); + WRITE64(args->offset); + WRITE32(args->count); + hdr->nops++; } -static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) +static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr) { __be32 *p; - + RESERVE_SPACE(8); WRITE32(OP_CREATE); WRITE32(create->ftype); @@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c RESERVE_SPACE(4 + create->name->len); WRITE32(create->name->len); WRITEMEM(create->name->name, create->name->len); + hdr->nops++; - return encode_attrs(xdr, create->attrs, create->server); + encode_attrs(xdr, create->attrs, create->server); } -static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) +static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) { - __be32 *p; + __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); - return 0; + RESERVE_SPACE(12); + WRITE32(OP_GETATTR); + WRITE32(1); + WRITE32(bitmap); + hdr->nops++; } -static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) +static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) { - __be32 *p; + __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); - return 0; + RESERVE_SPACE(16); + WRITE32(OP_GETATTR); + WRITE32(2); + WRITE32(bm0); + WRITE32(bm1); + hdr->nops++; } -static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) +static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - return encode_getattr_two(xdr, - bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1]); + encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], + bitmask[1] & nfs4_fattr_bitmap[1], hdr); } -static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) +static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1]); + encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); } -static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) +static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - return encode_getattr_two(xdr, - bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1]); + encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); } -static int encode_getfh(struct xdr_stream *xdr) +static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(4); WRITE32(OP_GETFH); - - return 0; + hdr->nops++; } -static int encode_link(struct xdr_stream *xdr, const struct qstr *name) +static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { __be32 *p; @@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name) WRITE32(OP_LINK); WRITE32(name->len); WRITEMEM(name->name, name->len); - - return 0; + hdr->nops++; } static inline int nfs4_lock_type(struct file_lock *fl, int block) @@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl) * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 */ -static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) +static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); WRITE32(args->lock_seqid->sequence->counter); } - - return 0; + hdr->nops++; } -static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) +static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg WRITE32(16); WRITEMEM("lock id:", 8); WRITE64(args->lock_owner.id); - - return 0; + hdr->nops++; } -static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) +static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); WRITE64(args->fl->fl_start); WRITE64(nfs4_lock_length(args->fl)); - - return 0; + hdr->nops++; } -static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) +static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { int len = name->len; __be32 *p; @@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) WRITE32(OP_LOOKUP); WRITE32(len); WRITEMEM(name->name, len); - - return 0; + hdr->nops++; } -static void encode_share_access(struct xdr_stream *xdr, int open_flags) +static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; RESERVE_SPACE(8); - switch (open_flags & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); - break; - case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); - break; - case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); - break; - default: - BUG(); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + WRITE32(NFS4_SHARE_ACCESS_READ); + break; + case FMODE_WRITE: + WRITE32(NFS4_SHARE_ACCESS_WRITE); + break; + case FMODE_READ|FMODE_WRITE: + WRITE32(NFS4_SHARE_ACCESS_BOTH); + break; + default: + WRITE32(0); } WRITE32(0); /* for linux, share_deny = 0 always */ } @@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena RESERVE_SPACE(8); WRITE32(OP_OPEN); WRITE32(arg->seqid->sequence->counter); - encode_share_access(xdr, arg->open_flags); + encode_share_access(xdr, arg->fmode); RESERVE_SPACE(28); WRITE64(arg->clientid); WRITE32(16); @@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op RESERVE_SPACE(4); switch(arg->open_flags & O_EXCL) { - case 0: - WRITE32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->server); - break; - default: - WRITE32(NFS4_CREATE_EXCLUSIVE); - encode_nfs4_verifier(xdr, &arg->u.verifier); + case 0: + WRITE32(NFS4_CREATE_UNCHECKED); + encode_attrs(xdr, arg->u.attrs, arg->server); + break; + default: + WRITE32(NFS4_CREATE_EXCLUSIVE); + encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a RESERVE_SPACE(4); switch (arg->open_flags & O_CREAT) { - case 0: - WRITE32(NFS4_OPEN_NOCREATE); - break; - default: - BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); - encode_createmode(xdr, arg); + case 0: + WRITE32(NFS4_OPEN_NOCREATE); + break; + default: + BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); + WRITE32(NFS4_OPEN_CREATE); + encode_createmode(xdr, arg); } } -static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) +static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) { __be32 *p; RESERVE_SPACE(4); switch (delegation_type) { - case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); - break; - case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); - break; - case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); - break; - default: - BUG(); + case 0: + WRITE32(NFS4_OPEN_DELEGATE_NONE); + break; + case FMODE_READ: + WRITE32(NFS4_OPEN_DELEGATE_READ); + break; + case FMODE_WRITE|FMODE_READ: + WRITE32(NFS4_OPEN_DELEGATE_WRITE); + break; + default: + BUG(); } } @@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * encode_string(xdr, name->len, name->name); } -static inline void encode_claim_previous(struct xdr_stream *xdr, int type) +static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; @@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc encode_string(xdr, name->len, name->name); } -static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) +static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr) { encode_openhdr(xdr, arg); encode_opentype(xdr, arg); switch (arg->claim) { - case NFS4_OPEN_CLAIM_NULL: - encode_claim_null(xdr, arg->name); - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - encode_claim_previous(xdr, arg->u.delegation_type); - break; - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); - break; - default: - BUG(); + case NFS4_OPEN_CLAIM_NULL: + encode_claim_null(xdr, arg->name); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + encode_claim_previous(xdr, arg->u.delegation_type); + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + default: + BUG(); } - return 0; + hdr->nops++; } -static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) +static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con WRITE32(OP_OPEN_CONFIRM); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); WRITE32(arg->seqid->sequence->counter); - - return 0; + hdr->nops++; } -static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) +static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea WRITE32(OP_OPEN_DOWNGRADE); WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); WRITE32(arg->seqid->sequence->counter); - encode_share_access(xdr, arg->open_flags); - return 0; + encode_share_access(xdr, arg->fmode); + hdr->nops++; } -static int -encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) +static void +encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr) { int len = fh->size; __be32 *p; @@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) WRITE32(OP_PUTFH); WRITE32(len); WRITEMEM(fh->data, len); - - return 0; + hdr->nops++; } -static int encode_putrootfh(struct xdr_stream *xdr) +static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + __be32 *p; - return 0; + RESERVE_SPACE(4); + WRITE32(OP_PUTROOTFH); + hdr->nops++; } static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) @@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); } -static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) +static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; @@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) RESERVE_SPACE(12); WRITE64(args->offset); WRITE32(args->count); - - return 0; + hdr->nops++; } -static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) +static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[2] = { FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, @@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; WRITE32(attrs[0] & readdir->bitmask[0]); WRITE32(attrs[1] & readdir->bitmask[1]); + hdr->nops++; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", __func__, (unsigned long long)readdir->cookie, @@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg ((u32 *)readdir->verifier.data)[1], attrs[0] & readdir->bitmask[0], attrs[1] & readdir->bitmask[1]); - - return 0; } -static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) +static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(4); WRITE32(OP_READLINK); - - return 0; + hdr->nops++; } -static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) +static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { __be32 *p; @@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) WRITE32(OP_REMOVE); WRITE32(name->len); WRITEMEM(name->name, name->len); - - return 0; + hdr->nops++; } -static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) +static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr) { __be32 *p; @@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con WRITE32(OP_RENAME); WRITE32(oldname->len); WRITEMEM(oldname->name, oldname->len); - + RESERVE_SPACE(4 + newname->len); WRITE32(newname->len); WRITEMEM(newname->name, newname->len); - - return 0; + hdr->nops++; } -static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) +static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(12); WRITE32(OP_RENEW); WRITE64(client_stateid->cl_clientid); - - return 0; + hdr->nops++; } -static int -encode_restorefh(struct xdr_stream *xdr) +static void +encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(4); WRITE32(OP_RESTOREFH); - - return 0; + hdr->nops++; } static int -encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) +encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr) { __be32 *p; @@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) RESERVE_SPACE(4); WRITE32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); + hdr->nops++; return 0; } -static int -encode_savefh(struct xdr_stream *xdr) +static void +encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; RESERVE_SPACE(4); WRITE32(OP_SAVEFH); - - return 0; + hdr->nops++; } -static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) +static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr) { - int status; __be32 *p; - - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); - if ((status = encode_attrs(xdr, arg->iap, server))) - return status; - - return 0; + RESERVE_SPACE(4+NFS4_STATEID_SIZE); + WRITE32(OP_SETATTR); + WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + hdr->nops++; + encode_attrs(xdr, arg->iap, server); } -static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) +static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) { __be32 *p; @@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); RESERVE_SPACE(4); WRITE32(setclientid->sc_cb_ident); - - return 0; + hdr->nops++; } -static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) +static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr) { - __be32 *p; - - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + __be32 *p; - return 0; + RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); + WRITE32(OP_SETCLIENTID_CONFIRM); + WRITE64(client_state->cl_clientid); + WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + hdr->nops++; } -static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) +static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) { __be32 *p; @@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args WRITE32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); - - return 0; + hdr->nops++; } -static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) +static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr) { __be32 *p; @@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei WRITE32(OP_DELEGRETURN); WRITEMEM(stateid->data, NFS4_STATEID_SIZE); - return 0; - + hdr->nops++; } /* * END OF "GENERIC" ENCODE ROUTINES. @@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status != 0) - goto out; - status = encode_access(&xdr, args->access); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_access(&xdr, args->access, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 4, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_lookup(&xdr, args->name)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->dir_fh, &hdr); + encode_lookup(&xdr, args->name, &hdr); + encode_getfh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putrootfh(&xdr)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) == 0) - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putrootfh(&xdr, &hdr); + encode_getfh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) != 0) - goto out; - if ((status = encode_remove(&xdr, &args->name)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_remove(&xdr, &args->name, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->old_dir)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_putfh(&xdr, args->new_dir)) != 0) - goto out; - if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if ((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->old_dir, &hdr); + encode_savefh(&xdr, &hdr); + encode_putfh(&xdr, args->new_dir, &hdr); + encode_rename(&xdr, args->old_name, args->new_name, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_restorefh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_ { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_link(&xdr, args->name)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if ((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_savefh(&xdr, &hdr); + encode_putfh(&xdr, args->dir_fh, &hdr); + encode_link(&xdr, args->name, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_restorefh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_savefh(&xdr)) != 0) - goto out; - if ((status = encode_create(&xdr, args)) != 0) - goto out; - if ((status = encode_getfh(&xdr)) != 0) - goto out; - if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) - goto out; - if ((status = encode_restorefh(&xdr)) != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->dir_fh, &hdr); + encode_savefh(&xdr, &hdr); + encode_create(&xdr, args, &hdr); + encode_getfh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_restorefh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fh)) == 0) - status = encode_getfattr(&xdr, args->bitmask); - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf */ static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) { - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 3, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_close(&xdr, args); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_close(&xdr, args, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 7, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_savefh(&xdr); - if (status) - goto out; - status = encode_open(&xdr, args); - if (status) - goto out; - status = encode_getfh(&xdr); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); - if (status) - goto out; - status = encode_restorefh(&xdr); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_savefh(&xdr, &hdr); + encode_open(&xdr, args, &hdr); + encode_getfh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_restorefh(&xdr, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_open_confirm(&xdr, args); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_open_confirm(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_ { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_open(&xdr, args); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_open(&xdr, args, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_open_downgrade(&xdr, args); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_open_downgrade(&xdr, args, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_lock(&xdr, args); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_lock(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_ { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_lockt(&xdr, args); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_lockt(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_ { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_locku(&xdr, args); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_locku(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; unsigned int replen; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_readlink(&xdr, args, req); + encode_putfh(&xdr, args->fh, &hdr); + encode_readlink(&xdr, args, req, &hdr); /* set up reply kvec * toplevel_status + taglen + rescount + OP_PUTFH + status @@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->pglen); - -out: - return status; + encode_nops(&hdr); + return 0; } /* @@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; int replen; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_readdir(&xdr, args, req); + encode_putfh(&xdr, args->fh, &hdr); + encode_readdir(&xdr, args, req, &hdr); /* set up reply kvec * toplevel_status + taglen + rescount + OP_PUTFH + status @@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", __func__, replen, args->pages, args->pgbase, args->count); - -out: - return status; + encode_nops(&hdr); + return 0; } /* @@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int replen, status; + int replen; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_read(&xdr, args); - if (status) - goto out; + encode_putfh(&xdr, args->fh, &hdr); + encode_read(&xdr, args, &hdr); /* set up reply kvec * toplevel status + taglen=0 + rescount + OP_PUTFH + status @@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, args->pgbase, args->count); req->rq_rcv_buf.flags |= XDRBUF_READ; -out: - return status; + encode_nops(&hdr); + return 0; } /* * Encode an SETATTR request */ static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) - { - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 3, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if(status) - goto out; - status = encode_setattr(&xdr, args, args->server); - if(status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_setattr(&xdr, args, args->server, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p, struct xdr_stream xdr; struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int replen, status; + int replen; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); + encode_putfh(&xdr, args->fh, &hdr); + encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr); + /* set up reply buffer: */ replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, args->acl_pages, args->acl_pgbase, args->acl_len); -out: - return status; + encode_nops(&hdr); + return 0; } /* @@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_write(&xdr, args); - if (status) - goto out; + encode_putfh(&xdr, args->fh, &hdr); + encode_write(&xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_commit(&xdr, args); - if (status) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_commit(&xdr, args, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (!status) - status = encode_fsinfo(&xdr, args->bitmask); - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_fsinfo(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (!status) - status = encode_getattr_one(&xdr, - args->bitmask[0] & nfs4_pathconf_bitmap[0]); - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], + &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status == 0) - status = encode_getattr_two(&xdr, - args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1]); - return status; + encode_putfh(&xdr, args->fh, &hdr); + encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], + args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, fhandle); - if (status == 0) - status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| - FATTR4_WORD0_LINK_SUPPORT| - FATTR4_WORD0_SYMLINK_SUPPORT| - FATTR4_WORD0_ACLSUPPORT); - return status; + encode_putfh(&xdr, fhandle, &hdr); + encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| + FATTR4_WORD0_LINK_SUPPORT| + FATTR4_WORD0_SYMLINK_SUPPORT| + FATTR4_WORD0_ACLSUPPORT, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 1, + .nops = 0, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - return encode_renew(&xdr, clp); + encode_renew(&xdr, clp, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4 { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 1, + .nops = 0, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - return encode_setclientid(&xdr, sc); + encode_setclientid(&xdr, sc, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_setclientid_confirm(&xdr, clp); - if (!status) - status = encode_putrootfh(&xdr); - if (!status) - status = encode_fsinfo(&xdr, lease_bitmap); - return status; + encode_setclientid_confirm(&xdr, clp, &hdr); + encode_putrootfh(&xdr, &hdr); + encode_fsinfo(&xdr, lease_bitmap, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fhandle); - if (status != 0) - goto out; - status = encode_delegreturn(&xdr, args->stateid); - if (status != 0) - goto out; - status = encode_getfattr(&xdr, args->bitmask); -out: - return status; + encode_putfh(&xdr, args->fhandle, &hdr); + encode_delegreturn(&xdr, args->stateid, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; } /* @@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 3, + .nops = 0, }; struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; int replen; - int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) - goto out; - if ((status = encode_lookup(&xdr, args->name)) != 0) - goto out; - if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0) - goto out; + encode_putfh(&xdr, args->dir_fh, &hdr); + encode_lookup(&xdr, args->name, &hdr); + encode_fs_locations(&xdr, args->bitmask, &hdr); + /* set up reply * toplevel_status + OP_PUTFH + status * + OP_LOOKUP + status + OP_GETATTR + status = 7 @@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, 0, PAGE_SIZE); -out: - return status; + encode_nops(&hdr); + return 0; } /* @@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) READ_BUF(8); READ32(hdr->status); READ32(hdr->taglen); - + READ_BUF(hdr->taglen + 4); hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); READ32(hdr->nops); + if (unlikely(hdr->nops < 1)) + return nfs4_stat_to_errno(hdr->status); return 0; } @@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + uint32_t attrlen, bitmap[2] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3070,14 +2931,13 @@ xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); return status; } - + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + uint32_t attrlen, bitmap[2] = {0}; int status; - + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) @@ -3107,10 +2967,9 @@ xdr_error: static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { __be32 *savep; - uint32_t attrlen, - bitmap[2] = {0}; + uint32_t attrlen, bitmap[2] = {0}; int status; - + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) @@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) { int status; - + status = decode_op_hdr(xdr, OP_LINK); if (status) return status; @@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr) /* This is too sick! */ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) { - __be32 *p; + __be32 *p; uint32_t limit_type, nblocks, blocksize; READ_BUF(12); READ32(limit_type); switch (limit_type) { - case 1: - READ64(*maxsize); - break; - case 2: - READ32(nblocks); - READ32(blocksize); - *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; + case 1: + READ64(*maxsize); + break; + case 2: + READ32(nblocks); + READ32(blocksize); + *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { - __be32 *p; - uint32_t delegation_type; + __be32 *p; + uint32_t delegation_type; READ_BUF(4); READ32(delegation_type); @@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) READ_BUF(NFS4_STATEID_SIZE+4); COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); READ32(res->do_recall); + switch (delegation_type) { - case NFS4_OPEN_DELEGATE_READ: - res->delegation_type = FMODE_READ; - break; - case NFS4_OPEN_DELEGATE_WRITE: - res->delegation_type = FMODE_WRITE|FMODE_READ; - if (decode_space_limit(xdr, &res->maxsize) < 0) + case NFS4_OPEN_DELEGATE_READ: + res->delegation_type = FMODE_READ; + break; + case NFS4_OPEN_DELEGATE_WRITE: + res->delegation_type = FMODE_WRITE|FMODE_READ; + if (decode_space_limit(xdr, &res->maxsize) < 0) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); @@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) { - __be32 *p; + __be32 *p; uint32_t savewords, bmlen, i; - int status; + int status; - status = decode_op_hdr(xdr, OP_OPEN); + status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + if (status) + return status; + READ_BUF(NFS4_STATEID_SIZE); + COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - decode_change_info(xdr, &res->cinfo); + decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); - if (bmlen > 10) - goto xdr_error; + READ_BUF(8); + READ32(res->rflags); + READ32(bmlen); + if (bmlen > 10) + goto xdr_error; - READ_BUF(bmlen << 2); + READ_BUF(bmlen << 2); savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) READ32(res->attrset[i]); @@ -3424,17 +3284,17 @@ xdr_error: static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; + __be32 *p; int status; - status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (status) + return status; + READ_BUF(NFS4_STATEID_SIZE); + COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + return 0; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) @@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n dprintk("NFS: readdir reply truncated!\n"); entry[1] = 1; } -out: +out: kunmap_atomic(kaddr, KM_USER0); return 0; short_pkt: @@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res) uint32_t bmlen; int status; - status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; @@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) READ32(opnum); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" - " %d\n", opnum); + " %d\n", opnum); return -EIO; } READ32(nfserr); @@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr) } /* + * END OF "GENERIC" DECODE ROUTINES. + */ + +/* * Decode OPEN_DOWNGRADE response */ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_open_downgrade(&xdr, res); + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_open_downgrade(&xdr, res); if (status != 0) goto out; decode_getfattr(&xdr, res->fattr, res->server); out: - return status; + return status; } /* - * END OF "GENERIC" DECODE ROUTINES. - */ - -/* * Decode ACCESS response */ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) @@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) goto out; @@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g struct xdr_stream xdr; struct compound_hdr hdr; int status; - + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); status = decode_compound_hdr(&xdr, &hdr); if (status) @@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g status = decode_getfattr(&xdr, res->fattr, res->server); out: return status; - } /* @@ -4034,21 +3892,20 @@ out: static int nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) { - struct xdr_stream xdr; - struct compound_hdr hdr = { - .nops = 2, - }; - int status; - - xdr_init_encode(&xdr, &req->rq_snd_buf, p); - encode_compound_hdr(&xdr, &hdr); - status = encode_putfh(&xdr, args->fh); - if (status) - goto out; - status = encode_setacl(&xdr, args); -out: - return status; + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 0, + }; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + status = encode_setacl(&xdr, args, &hdr); + encode_nops(&hdr); + return status; } + /* * Decode SETACL response */ @@ -4099,18 +3956,18 @@ out: */ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_close(&xdr, res); + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_close(&xdr, res); if (status != 0) goto out; /* @@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos */ decode_getfattr(&xdr, res->fattr, res->server); out: - return status; + return status; } /* @@ -4129,23 +3986,23 @@ out: */ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_savefh(&xdr); + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_savefh(&xdr); + if (status) + goto out; + status = decode_open(&xdr, res); if (status) goto out; - status = decode_open(&xdr, res); - if (status) - goto out; if (decode_getfh(&xdr, &res->fh) != 0) goto out; if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) @@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr goto out; decode_getfattr(&xdr, res->dir_attr, res->server); out: - return status; + return status; } /* @@ -4162,20 +4019,20 @@ out: */ static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_open_confirm(&xdr, res); + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_open_confirm(&xdr, res); out: - return status; + return status; } /* @@ -4183,23 +4040,23 @@ out: */ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_open(&xdr, res); - if (status) - goto out; + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_open(&xdr, res); + if (status) + goto out; decode_getfattr(&xdr, res->f_attr, res->server); out: - return status; + return status; } /* @@ -4207,25 +4064,25 @@ out: */ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) { - struct xdr_stream xdr; - struct compound_hdr hdr; - int status; - - xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); - status = decode_compound_hdr(&xdr, &hdr); - if (status) - goto out; - status = decode_putfh(&xdr); - if (status) - goto out; - status = decode_setattr(&xdr, res); - if (status) - goto out; + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_setattr(&xdr, res); + if (status) + goto out; status = decode_getfattr(&xdr, res->fattr, res->server); if (status == NFS4ERR_DELAY) status = 0; out: - return status; + return status; } /* @@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf status = decode_putfh(&xdr); if (!status) status = decode_fsinfo(&xdr, fsinfo); - if (!status) - status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, status = decode_compound_hdr(&xdr, &hdr); if (!status) status = decode_setclientid(&xdr, clp); - if (!status) - status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str status = decode_putrootfh(&xdr); if (!status) status = decode_fsinfo(&xdr, fsinfo); - if (!status) - status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat) .p_replen = NFS4_##restype##_sz, \ .p_statidx = NFSPROC4_CLNT_##proc, \ .p_name = #proc, \ - } +} struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index d74d16ce0d49..d9ef602fbc5a 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,6 +86,8 @@ #include <net/ipconfig.h> #include <linux/parser.h> +#include "internal.h" + /* Define this to allow debugging output */ #undef NFSROOT_DEBUG #define NFSDBG_FACILITY NFSDBG_ROOT @@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = ""; static __be32 servaddr __initdata = 0; /* Name of directory to mount */ -static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; +static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, }; /* NFS-related data */ static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ @@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name) printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); return -1; } - sprintf(nfs_path, buf, cp); + sprintf(nfs_export_path, buf, cp); return 1; } @@ -340,7 +342,7 @@ static int __init root_nfs_addr(void) static void __init root_nfs_print(void) { printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", - nfs_path, nfs_data.hostname); + nfs_export_path, nfs_data.hostname); printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", @@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void) { struct nfs_fh fh; struct sockaddr_in sin; + struct nfs_mount_request request = { + .sap = (struct sockaddr *)&sin, + .salen = sizeof(sin), + .dirpath = nfs_export_path, + .version = (nfs_data.flags & NFS_MOUNT_VER3) ? + NFS_MNT3_VERSION : NFS_MNT_VERSION, + .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? + XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP, + .fh = &fh, + }; int status; - int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? - XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP; - int version = (nfs_data.flags & NFS_MOUNT_VER3) ? - NFS_MNT3_VERSION : NFS_MNT_VERSION; set_sockaddr(&sin, servaddr, htons(mount_port)); - status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, - nfs_path, version, protocol, &fh); + status = nfs_mount(&request); if (status < 0) printk(KERN_ERR "Root-NFS: Server returned error %d " - "while mounting %s\n", status, nfs_path); + "while mounting %s\n", status, nfs_export_path); else { nfs_data.root.size = fh.size; memcpy(nfs_data.root.data, fh.data, fh.size); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 40d17987d0e8..f856004bb7fa 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page) unsigned int len; int error; - error = nfs_wb_page(inode, page); - if (error) - goto out_unlock; - if (PageUptodate(page)) - goto out_unlock; - len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index bb0313ac9e1f..d6686f4786dc 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -75,6 +75,7 @@ enum { Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, Opt_sharecache, Opt_nosharecache, + Opt_resvport, Opt_noresvport, /* Mount options that take integer arguments */ Opt_port, @@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nordirplus, "nordirplus" }, { Opt_sharecache, "sharecache" }, { Opt_nosharecache, "nosharecache" }, + { Opt_resvport, "resvport" }, + { Opt_noresvport, "noresvport" }, { Opt_port, "port=%u" }, { Opt_rsize, "rsize=%u" }, @@ -512,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, - { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, + { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, + { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; @@ -1033,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw, case Opt_nosharecache: mnt->flags |= NFS_MOUNT_UNSHARED; break; + case Opt_resvport: + mnt->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_noresvport: + mnt->flags |= NFS_MOUNT_NORESVPORT; + break; /* * options that take numeric values @@ -1327,8 +1337,14 @@ out_security_failure: static int nfs_try_mount(struct nfs_parsed_mount_data *args, struct nfs_fh *root_fh) { - struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; - char *hostname; + struct nfs_mount_request request = { + .sap = (struct sockaddr *) + &args->mount_server.address, + .dirpath = args->nfs_server.export_path, + .protocol = args->mount_server.protocol, + .fh = root_fh, + .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + }; int status; if (args->mount_server.version == 0) { @@ -1337,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, else args->mount_server.version = NFS_MNT_VERSION; } + request.version = args->mount_server.version; if (args->mount_server.hostname) - hostname = args->mount_server.hostname; + request.hostname = args->mount_server.hostname; else - hostname = args->nfs_server.hostname; + request.hostname = args->nfs_server.hostname; /* * Construct the mount server's address. */ if (args->mount_server.address.ss_family == AF_UNSPEC) { - memcpy(sap, &args->nfs_server.address, + memcpy(request.sap, &args->nfs_server.address, args->nfs_server.addrlen); args->mount_server.addrlen = args->nfs_server.addrlen; } + request.salen = args->mount_server.addrlen; /* * autobind will be used if mount_server.port == 0 */ - nfs_set_port(sap, args->mount_server.port); + nfs_set_port(request.sap, args->mount_server.port); /* * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(sap, - args->mount_server.addrlen, - hostname, - args->nfs_server.export_path, - args->mount_server.version, - args->mount_server.protocol, - root_fh); + status = nfs_mount(&request); if (status == 0) return 0; dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", - hostname, status); + request.hostname, status); return status; } @@ -2419,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - nfs_return_all_delegations(sb); + nfs_super_return_all_delegations(sb); kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index c11f5375d7c1..04133aacb1e5 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -29,8 +29,8 @@ MODULE_LICENSE("GPL"); -EXPORT_SYMBOL(nfsacl_encode); -EXPORT_SYMBOL(nfsacl_decode); +EXPORT_SYMBOL_GPL(nfsacl_encode); +EXPORT_SYMBOL_GPL(nfsacl_decode); struct nfsacl_encode_desc { struct xdr_array2_desc desc; diff --git a/fs/nfsctl.c b/fs/nfsctl.c index b1acbd6ab6fb..b27451909dff 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags) return ERR_PTR(error); if (flags == O_RDWR) - error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); + error = may_open(&nd.path, MAY_READ|MAY_WRITE, + FMODE_READ|FMODE_WRITE); else - error = may_open(&nd, MAY_WRITE, FMODE_WRITE); + error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE); if (!error) return dentry_open(nd.path.dentry, nd.path.mnt, flags, diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 0184fe9b514c..c903e04aa217 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) ret = set_groups(new, gi); put_group_info(gi); - if (!ret) + if (ret < 0) goto error; - if (new->uid) + if (new->fsuid) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 094747a1227c..c464181b5994 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -53,9 +53,6 @@ #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 -/* declarations */ -static const struct rpc_call_ops nfs4_cb_null_ops; - /* Index of predefined Linux callback client operations */ enum { @@ -358,6 +355,7 @@ static struct rpc_program cb_program = { .nrvers = ARRAY_SIZE(nfs_cb_version), .version = nfs_cb_version, .stats = &cb_stats, + .pipe_dir_name = "/nfsd4_cb", }; /* Reference counting, callback cleanup, etc., all look racy as heck. @@ -382,8 +380,9 @@ static int do_probe_callback(void *data) .program = &cb_program, .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, - .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ + .authflavor = clp->cl_flavor, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .client_name = clp->cl_principal, }; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], @@ -392,6 +391,11 @@ static int do_probe_callback(void *data) struct rpc_clnt *client; int status; + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { + status = nfserr_cb_path_down; + goto out_err; + } + /* Initialize address */ memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 669461e291ae..9fa60a3ad48c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -946,6 +946,11 @@ encode_op: nfsd4_encode_operation(resp, op); status = op->status; } + + dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", + args->ops, args->opcnt, resp->opcnt, op->opnum, + be32_to_cpu(status)); + if (cstate->replay_owner) { nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0f9d6efaa62b..74f7b67567fd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) md5_to_hex(dname, cksum.data); - kfree(cksum.data); status = nfs_ok; out: + kfree(cksum.data); crypto_free_hash(desc.tfm); out_no_tfm: return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bf4cd46a5a11..88db7d3ec120 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -54,6 +54,7 @@ #include <linux/mutex.h> #include <linux/lockd/bind.h> #include <linux/module.h> +#include <linux/sunrpc/svcauth_gss.h> #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp) shutdown_callback_client(clp); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); + kfree(clp->cl_principal); kfree(clp->cl_name.data); kfree(clp); } @@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned int strhashval; struct nfs4_client *conf, *unconf, *new; __be32 status; + char *princ; char dname[HEXDIR_LEN]; if (!check_name(clname)) @@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } copy_verf(new, &clverifier); new->cl_addr = sin->sin_addr.s_addr; + new->cl_flavor = rqstp->rq_flavor; + princ = svc_gss_principal(rqstp); + if (princ) { + new->cl_principal = kstrdup(princ, GFP_KERNEL); + if (new->cl_principal == NULL) { + free_client(new); + goto out; + } + } copy_cred(&new->cl_cred, &rqstp->rq_cred); gen_confirm(new); gen_callback(new, setclid); @@ -2404,6 +2416,26 @@ out: #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + #define lockownerid_hashval(id) \ ((id) & LOCK_HASH_MASK) @@ -2423,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags) { - struct nfs4_stateid *local = NULL; + struct nfs4_stateid *local; u32 st_id = stid->si_stateownerid; u32 f_id = stid->si_fileid; unsigned int hashval; dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { hashval = stateid_hashval(st_id, f_id); list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { if ((local->st_stateid.si_stateownerid == st_id) && @@ -2437,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags) return local; } } - if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + + if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { hashval = stateid_hashval(st_id, f_id); list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { if ((local->st_stateid.si_stateownerid == st_id) && @@ -2506,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; - deny->ld_length = ~(u64)0; - if (fl->fl_end != ~(u64)0) + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; if (fl->fl_type != F_RDLCK) @@ -2604,7 +2637,7 @@ out: static int check_lock_length(u64 offset, u64 length) { - return ((length == 0) || ((length != ~(u64)0) && + return ((length == 0) || ((length != NFS4_MAX_UINT64) && LOFF_OVERFLOW(offset, length))); } @@ -2724,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; - if ((lock->lk_length == ~(u64)0) || - LOFF_OVERFLOW(lock->lk_offset, lock->lk_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = lock->lk_offset + lock->lk_length - 1; + file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); nfs4_transform_lock_offset(&file_lock); /* @@ -2769,6 +2798,25 @@ out: } /* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. (Arguably perhaps test_lock should be done with an + * inode operation.) + */ +static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct file *file; + int err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + err = vfs_test_lock(file, lock); + nfsd_close(file); + return err; +} + +/* * LOCKT operation */ __be32 @@ -2776,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lockt *lockt) { struct inode *inode; - struct file file; struct file_lock file_lock; int error; __be32 status; @@ -2827,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; - if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1; + file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); nfs4_transform_lock_offset(&file_lock); - /* vfs_test_lock uses the struct file _only_ to resolve the inode. - * since LOCKT doesn't require an OPEN, and therefore a struct - * file may not exist, pass vfs_test_lock a struct file with - * only the dentry:inode set. - */ - memset(&file, 0, sizeof (struct file)); - file.f_path.dentry = cstate->current_fh.fh_dentry; - status = nfs_ok; - error = vfs_test_lock(&file, &file_lock); + error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); if (error) { status = nfserrno(error); goto out; @@ -2894,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; - if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = locku->lu_offset + locku->lu_length - 1; + file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); nfs4_transform_lock_offset(&file_lock); /* diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index afcdf4b76843..f65953be39c0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1,6 +1,4 @@ /* - * fs/nfs/nfs4xdr.c - * * Server-side XDR for NFSv4 * * Copyright (c) 2002 The Regents of the University of Michigan. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77d7b8c531a6..3d93b2064ce5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size); static ssize_t write_getfd(struct file *file, char *buf, size_t size); static ssize_t write_getfs(struct file *file, char *buf, size_t size); static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); @@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif -static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size); -static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size); - static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Svc] = write_svc, [NFSD_Add] = write_add, @@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Getfd] = write_getfd, [NFSD_Getfs] = write_getfs, [NFSD_Fh] = write_filehandle, - [NFSD_FO_UnlockIP] = failover_unlock_ip, - [NFSD_FO_UnlockFS] = failover_unlock_fs, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, [NFSD_Threads] = write_threads, [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, @@ -176,10 +175,24 @@ static const struct file_operations exports_operations = { /*----------------------------------------------------------------------------*/ /* * payload - write methods - * If the method has a response, the response should be put in buf, - * and the length returned. Otherwise return 0 or and -error. */ +/** + * write_svc - Start kernel's NFSD server + * + * Deprecated. /proc/fs/nfsd/threads is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_svc + * svc_port: port number of this + * server's listener + * svc_nthreads: number of threads to start + * size: size in bytes of passed in nfsctl_svc + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_svc(struct file *file, char *buf, size_t size) { struct nfsctl_svc *data; @@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size) return nfsd_svc(data->svc_port, data->svc_nthreads); } +/** + * write_add - Add or modify client entry in auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: no. of items in cl_addrlist + * cl_addrlist: array of client addresses + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ static ssize_t write_add(struct file *file, char *buf, size_t size) { struct nfsctl_client *data; @@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size) return exp_addclient(data); } +/** + * write_del - Remove client from auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: ignored + * cl_addrlist: ignored + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ static ssize_t write_del(struct file *file, char *buf, size_t size) { struct nfsctl_client *data; @@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size) return exp_delclient(data); } +/** + * write_export - Export part or all of a local file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client allowed to access + * this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: fsid to use for this export + * ex_ino: ignored + * ex_flags: export flags for this export + * ex_anon_uid: UID to use for anonymous + * requests + * ex_anon_gid: GID to use for anonymous + * requests + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_export(struct file *file, char *buf, size_t size) { struct nfsctl_export *data; @@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size) return exp_export(data); } +/** + * write_unexport - Unexport a previously exported file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client no longer allowed + * to access this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: ignored + * ex_ino: ignored + * ex_flags: ignored + * ex_anon_uid: ignored + * ex_anon_gid: ignored + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_unexport(struct file *file, char *buf, size_t size) { struct nfsctl_export *data; @@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size) return exp_unexport(data); } +/** + * write_getfs - Get a variable-length NFS file handle by path + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fsparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_maxlen: maximum size of returned file + * handle + * size: size in bytes of passed in nfsctl_fsparm + * Output: + * On success: passed-in buffer filled with a knfsd_fh structure + * (a variable-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ static ssize_t write_getfs(struct file *file, char *buf, size_t size) { struct nfsctl_fsparm *data; @@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) return err; } +/** + * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fdparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_version: fdparm structure version + * size: size in bytes of passed in nfsctl_fdparm + * Output: + * On success: passed-in buffer filled with nfsctl_res + * (a fixed-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ static ssize_t write_getfd(struct file *file, char *buf, size_t size) { struct nfsctl_fdparm *data; @@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) return err; } -static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) +/** + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IPv4 address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); } -static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) +/** + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) { struct path path; char *fo_path; @@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) if (error) return error; + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); path_put(&path); return error; } +/** + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. + * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { - /* request is: - * domain path maxsize - * response is - * filehandle - * - * qword quoting is used, so filehandle will be \x.... - */ char *dname, *path; int uninitialized_var(maxsize); char *mesg = buf; @@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) dname = mesg; len = qword_get(&mesg, dname, size); - if (len <= 0) return -EINVAL; + if (len <= 0) + return -EINVAL; path = dname+len+1; len = qword_get(&mesg, path, size); - if (len <= 0) return -EINVAL; + if (len <= 0) + return -EINVAL; len = get_int(&mesg, &maxsize); if (len) @@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (len) return len; - mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); mesg[-1] = '\n'; return mesg - buf; } +/** + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_threads(struct file *file, char *buf, size_t size) { - /* if size > 0, look for a number of threads and call nfsd_svc - * then write out number of threads as reply - */ char *mesg = buf; int rv; if (size > 0) { @@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) rv = get_int(&mesg, &newthreads); if (rv) return rv; - if (newthreads <0) + if (newthreads < 0) return -EINVAL; - rv = nfsd_svc(2049, newthreads); + rv = nfsd_svc(NFS_PORT, newthreads); if (rv) return rv; } @@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for an array of number of threads per node @@ -517,10 +780,6 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { - /* - * Format: - * [-/+]vers [-/+]vers ... - */ char *mesg = buf; char *vers, sign; int len, num; @@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) return len; } +/** + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_versions(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; } +/** + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by an integer value representing a + * previously passed in socket file + * descriptor + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service no longer listens on that socket; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique name of the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by a transport name and an unsigned + * integer value representing the port + * to listen on, separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service no longer listens + * on that transport + * On error: return code is a negative errno value + */ static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int nfsd_max_blksize; +/** + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; @@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f0da7d9c3a92..9f1ca17293d3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -258,14 +258,32 @@ out: return error; } -/* - * Perform sanity checks on the dentry in a client's file handle. +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. * - * Note that the file handle dentry may need to be freed even after - * an error return. + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. * - * This is only called at the start of an nfsproc call, so fhp points to - * a svc_fh which is all 0 except for the over-the-wire file handle. + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * include/linux/nfsd/nfsd.h. */ __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) @@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, goto retry; break; } + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; } else if (exp->ex_uuid) { if (fhp->fh_maxsize >= 64) { if (root_export) @@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, else fsid_type = FSID_UUID4_INUM; } - } else if (exp->ex_flags & NFSEXP_FSID) - fsid_type = FSID_NUM; - else if (!old_valid_dev(ex_dev)) + } else if (!old_valid_dev(ex_dev)) /* for newer device numbers, we must use a newer fsid format */ fsid_type = FSID_ENCODE_DEV; else diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 5cffeca7acef..6f7f26351227 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -622,6 +622,7 @@ nfserrno (int errno) { nfserr_badname, -ESRCH }, { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, }; int i; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d1c5f787b365..6e50aaa56ca2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, return err; } - static int nfsd_sync(struct file *filp) @@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dirp = dentry->d_inode; err = nfserr_notdir; - if(!dirp->i_op || !dirp->i_op->lookup) + if (!dirp->i_op->lookup) goto out; /* * Check whether the response file handle has been verified yet. @@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* Get all the sanity checks out of the way before * we lock the parent. */ err = nfserr_notdir; - if(!dirp->i_op || !dirp->i_op->lookup) + if (!dirp->i_op->lookup) goto out; fh_lock_nested(fhp, I_MUTEX_PARENT); @@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) inode = dentry->d_inode; err = nfserr_inval; - if (!inode->i_op || !inode->i_op->readlink) + if (!inode->i_op->readlink) goto out; touch_atime(fhp->fh_export->ex_path.mnt, dentry); @@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) size_t size; int error; - if (!IS_POSIXACL(inode) || !inode->i_op || + if (!IS_POSIXACL(inode) || !inode->i_op->setxattr || !inode->i_op->removexattr) return -EOPNOTSUPP; switch(type) { diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig new file mode 100644 index 000000000000..50914d7303c6 --- /dev/null +++ b/fs/notify/Kconfig @@ -0,0 +1,2 @@ +source "fs/notify/dnotify/Kconfig" +source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/Makefile b/fs/notify/Makefile new file mode 100644 index 000000000000..5a95b6010ce7 --- /dev/null +++ b/fs/notify/Makefile @@ -0,0 +1,2 @@ +obj-y += dnotify/ +obj-y += inotify/ diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig new file mode 100644 index 000000000000..26adf5dfa646 --- /dev/null +++ b/fs/notify/dnotify/Kconfig @@ -0,0 +1,10 @@ +config DNOTIFY + bool "Dnotify support" + default y + help + Dnotify is a directory-based per-fd file change notification system + that uses signals to communicate events to user-space. There exist + superior alternatives, but some applications may still rely on + dnotify. + + If unsure, say Y. diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile new file mode 100644 index 000000000000..f145251dcadb --- /dev/null +++ b/fs/notify/dnotify/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_DNOTIFY) += dnotify.o diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c index 676073b8dda5..b0aa2cde80bd 100644 --- a/fs/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn->dn_next = inode->i_dnotify; inode->i_dnotify = dn; spin_unlock(&inode->i_lock); - - if (filp->f_op && filp->f_op->dir_notify) - return filp->f_op->dir_notify(filp, arg); return 0; out_free: diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig new file mode 100644 index 000000000000..446792841023 --- /dev/null +++ b/fs/notify/inotify/Kconfig @@ -0,0 +1,27 @@ +config INOTIFY + bool "Inotify file change notification support" + default y + ---help--- + Say Y here to enable inotify support. Inotify is a file change + notification system and a replacement for dnotify. Inotify fixes + numerous shortcomings in dnotify and introduces several new features + including multiple file events, one-shot support, and unmount + notification. + + For more information, see <file:Documentation/filesystems/inotify.txt> + + If unsure, say Y. + +config INOTIFY_USER + bool "Inotify support for userspace" + depends on INOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + + For more information, see <file:Documentation/filesystems/inotify.txt> + + If unsure, say Y. diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile new file mode 100644 index 000000000000..e290f3bb9d8d --- /dev/null +++ b/fs/notify/inotify/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INOTIFY) += inotify.o +obj-$(CONFIG_INOTIFY_USER) += inotify_user.o diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c index dae3f28f30d4..dae3f28f30d4 100644 --- a/fs/inotify.c +++ b/fs/notify/inotify/inotify.c diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2425bbd871f..81b8644b0136 100644 --- a/fs/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -76,10 +76,10 @@ struct inotify_device { struct mutex ev_mutex; /* protects event queue */ struct mutex up_mutex; /* synchronizes watch updates */ struct list_head events; /* list of queued events */ - atomic_t count; /* reference count */ struct user_struct *user; /* user who opened this dev */ struct inotify_handle *ih; /* inotify handle */ struct fasync_struct *fa; /* async notification */ + atomic_t count; /* reference count */ unsigned int queue_size; /* size of the queue (bytes) */ unsigned int event_count; /* number of pending events */ unsigned int max_events; /* maximum number of events */ @@ -704,7 +704,7 @@ fput_and_out: return ret; } -asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) +asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd) { struct file *filp; struct inotify_device *dev; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e9da092e2772..86bef156cf0a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) ni->allocated_size = sle64_to_cpu( a->data.non_resident.allocated_size); } - /* Setup the operations for this attribute inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; if (NInoMstProtected(ni)) vi->i_mapping->a_ops = &ntfs_mst_aops; else diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 589dcdfdfe3c..01596079dd63 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o ocfs2-objs := \ alloc.o \ aops.o \ + blockcheck.o \ buffer_head_io.o \ dcache.o \ dir.o \ @@ -35,8 +36,14 @@ ocfs2-objs := \ sysfile.o \ uptodate.o \ ver.o \ + quota_local.o \ + quota_global.o \ xattr.o +ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) +ocfs2-objs += acl.o +endif + ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o ocfs2_stack_user-objs := stack_user.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c new file mode 100644 index 000000000000..12dfb44c22e5 --- /dev/null +++ b/fs/ocfs2/acl.c @@ -0,0 +1,479 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/acl.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "ocfs2_fs.h" + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from xattr value to acl struct. + */ +static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) +{ + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(struct posix_acl_entry)) + return ERR_PTR(-EINVAL); + + count = size / sizeof(struct posix_acl_entry); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + struct ocfs2_acl_entry *entry = + (struct ocfs2_acl_entry *)value; + + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + value += sizeof(struct posix_acl_entry); + + } + return acl; +} + +/* + * Convert acl struct to xattr value. + */ +static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) +{ + struct ocfs2_acl_entry *entry = NULL; + char *ocfs2_acl; + size_t n; + + *size = acl->a_count * sizeof(struct posix_acl_entry); + + ocfs2_acl = kmalloc(*size, GFP_NOFS); + if (!ocfs2_acl) + return ERR_PTR(-ENOMEM); + + entry = (struct ocfs2_acl_entry *)ocfs2_acl; + for (n = 0; n < acl->a_count; n++, entry++) { + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return ocfs2_acl; +} + +static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, + int type, + struct buffer_head *di_bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + "", value, retval); + } + + if (retval > 0) + acl = ocfs2_acl_from_xattr(value, retval); + else if (retval == -ENODATA || retval == 0) + acl = NULL; + else + acl = ERR_PTR(retval); + + kfree(value); + + return acl; +} + + +/* + * Get posix acl. + */ +static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + acl = ERR_PTR(ret); + return acl; + } + + acl = ocfs2_get_acl_nolock(inode, type, di_bh); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + */ +static int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + else { + inode->i_mode = mode; + if (ret == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = ocfs2_acl_to_xattr(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + if (handle) + ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, + "", value, size, 0, + meta_ac, data_ac); + else + ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + + return ret; +} + +int ocfs2_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int ret = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return ret; + } + + return -EAGAIN; +} + +int ocfs2_acl_chmod(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl, *clone; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + clone, NULL, NULL); + posix_acl_release(clone); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + clone, meta_ac, data_ac); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return ret; +} + +static size_t ocfs2_xattr_list_acl_access(struct inode *inode, + char *list, + size_t list_len, + const char *name, + size_t name_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t ocfs2_xattr_list_acl_default(struct inode *inode, + char *list, + size_t list_len, + const char *name, + size_t name_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int ocfs2_xattr_get_acl(struct inode *inode, + int type, + void *buffer, + size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ocfs2_get_acl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return ret; +} + +static int ocfs2_xattr_get_acl_access(struct inode *inode, + const char *name, + void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int ocfs2_xattr_get_acl_default(struct inode *inode, + const char *name, + void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int ocfs2_xattr_set_acl(struct inode *inode, + int type, + const void *value, + size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret = 0; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto cleanup; + } + } else + acl = NULL; + + ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + +cleanup: + posix_acl_release(acl); + return ret; +} + +static int ocfs2_xattr_set_acl_access(struct inode *inode, + const char *name, + const void *value, + size_t size, + int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int ocfs2_xattr_set_acl_default(struct inode *inode, + const char *name, + const void *value, + size_t size, + int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +struct xattr_handler ocfs2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = ocfs2_xattr_list_acl_access, + .get = ocfs2_xattr_get_acl_access, + .set = ocfs2_xattr_set_acl_access, +}; + +struct xattr_handler ocfs2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = ocfs2_xattr_list_acl_default, + .get = ocfs2_xattr_get_acl_default, + .set = ocfs2_xattr_set_acl_default, +}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h new file mode 100644 index 000000000000..8f6389ed4da5 --- /dev/null +++ b/fs/ocfs2/acl.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_ACL_H +#define OCFS2_ACL_H + +#include <linux/posix_acl_xattr.h> + +struct ocfs2_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + +extern int ocfs2_check_acl(struct inode *, int); +extern int ocfs2_acl_chmod(struct inode *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); + +#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ + +#define ocfs2_check_acl NULL +static inline int ocfs2_acl_chmod(struct inode *inode) +{ + return 0; +} +static inline int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + return 0; +} + +#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ + +#endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0cc2deb9394c..d861096c9d81 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -36,6 +37,7 @@ #include "alloc.h" #include "aops.h" +#include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -46,6 +48,7 @@ #include "file.h" #include "super.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode, static int ocfs2_dinode_sanity_check(struct inode *inode, struct ocfs2_extent_tree *et) { - int ret = 0; - struct ocfs2_dinode *di; + struct ocfs2_dinode *di = et->et_object; BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); - di = et->et_object; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - } - - return ret; + return 0; } static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) @@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) { - struct ocfs2_xattr_value_root *xv = et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - et->et_root_el = &xv->xr_list; + et->et_root_el = &vb->vb_xv->xr_list; } static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 blkno) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *)et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - xv->xr_last_eb_blk = cpu_to_le64(blkno); + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); } static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *) et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - return le64_to_cpu(xv->xr_last_eb_blk); + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); } static void ocfs2_xattr_value_update_clusters(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *)et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - le32_add_cpu(&xv->xr_clusters, clusters); + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { @@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh, + ocfs2_journal_access_func access, void *obj, struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; + et->et_root_journal_access = access; if (!obj) obj = (void *)bh->b_data; et->et_object = obj; @@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); } void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, NULL, - &ocfs2_xattr_tree_et_ops); + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); } void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, - struct buffer_head *bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb) { - __ocfs2_init_extent_tree(et, inode, bh, xv, + __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, &ocfs2_xattr_value_et_ops); } @@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode, et->et_ops->eo_update_clusters(inode, et, clusters); } +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct inode *inode, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, inode, et->et_root_bh, + type); +} + static inline int ocfs2_et_insert_check(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) @@ -402,12 +405,14 @@ struct ocfs2_path_item { #define OCFS2_MAX_PATH_DEPTH 5 struct ocfs2_path { - int p_tree_depth; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; }; #define path_root_bh(_path) ((_path)->p_node[0].bh) #define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) #define path_num_items(_path) ((_path)->p_tree_depth + 1) @@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) */ if (keep_root) depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; path->p_tree_depth = depth; } @@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) BUG_ON(path_root_bh(dest) != path_root_bh(src)); BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); ocfs2_reinit_path(dest, 1); @@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) int i; BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { brelse(dest->p_node[i].bh); @@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, } static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, - struct ocfs2_extent_list *root_el) + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) { struct ocfs2_path *path; @@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, get_bh(root_bh); path_root_bh(path) = root_bh; path_root_el(path) = root_el; + path_root_access(path) = access; } return path; } +static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + +/* + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. + */ +static int ocfs2_path_bh_journal_access(handle_t *handle, + struct inode *inode, + struct ocfs2_path *path, + int idx) +{ + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; + + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, inode, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); +} + /* * Convenience function to journal all components in a path. */ @@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, goto out; for(i = 0; i < path_num_items(path); i++) { - ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, path, i); if (ret < 0) { mlog_errno(ret); goto out; @@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt { int c_split_covers_rec; }; +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + mlog(0, "Validating extent block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. + */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + /* * How many free extents have we got before we need more meta data? */ @@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, last_eb_blk = ocfs2_et_get_last_eb_blk(et); if (last_eb_blk) { - retval = ocfs2_read_block(inode, last_eb_blk, - &eb_bh); + retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; @@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, } ocfs2_set_new_buffer_uptodate(inode, bhs[i]); - status = ocfs2_journal_access(handle, inode, bhs[i], - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, for(i = 0; i < new_blocks; i++) { bh = new_eb_bhs[i]; eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access(handle, inode, *last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } if (eb_bh) { - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; root_el = et->et_root_el; - status = ocfs2_journal_access(handle, inode, new_eb_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, brelse(bh); bh = NULL; - status = ocfs2_read_block(inode, blkno, &bh); + status = ocfs2_read_extent_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; } eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } el = &eb->h_list; if (le16_to_cpu(el->l_next_free_rec) < @@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_extent_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode, eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out; - } if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { @@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, *ret_left_path = NULL; - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, return -EAGAIN; if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { - ret = ocfs2_journal_access(handle, inode, - path_leaf_bh(right_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_eb(handle, inode, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, */ BUG_ON(right_has_empty && !del_right_subtree); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -2596,16 +2694,17 @@ out: static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, handle_t *handle, - struct buffer_head *bh, - struct ocfs2_extent_list *el) + struct ocfs2_path *path) { int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); if (!ocfs2_is_empty_extent(&el->l_recs[0])) return 0; - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, path, + path_num_items(path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, goto out; } - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_cp_path(left_path, path); - right_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + right_path = ocfs2_new_path_from_path(path); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, * Caller might still want to make changes to the * tree root, so re-add it to the journal here. */ - ret = ocfs2_journal_access(handle, inode, - path_root_bh(left_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, 0); if (ret) { mlog_errno(ret); goto out; @@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, * We have a path to the left of this one - it needs * an update too. */ - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2875,8 +2970,7 @@ rightmost_no_delete: * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, - path_leaf_bh(path), - path_leaf_el(path)); + path); if (ret) mlog_errno(ret); goto out; @@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode, /* This function shouldn't be called for the rightmost leaf. */ BUG_ON(right_cpos == 0); - right_path = ocfs2_new_path(path_root_bh(left_path), - path_root_el(left_path)); + right_path = ocfs2_new_path_from_path(left_path); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, right_rec = &el->l_recs[index + 1]; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, left_path, + path_num_items(left_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode, /* This function shouldn't be called for the leftmost leaf. */ BUG_ON(left_cpos == 0); - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, has_empty_extent = 1; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + path_num_items(right_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, * leftmost leaf. */ if (left_cpos) { - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode, struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; struct ocfs2_extent_rec *rec, *tmprec; - right_el = path_leaf_el(right_path);; + right_el = path_leaf_el(right_path); if (left_path) left_el = path_leaf_el(left_path); @@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + right_path = ocfs2_new_path_from_et(et); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, goto out; if (left_cpos != 0) { - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) goto out; @@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); + ocfs2_error(inode->i_sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; goto out; } rec = &new_el->l_recs[ @@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (right_cpos == 0) goto out; - right_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + right_path = ocfs2_new_path_from_path(path); if (!right_path) goto out; @@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); + ocfs2_error(inode->i_sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); + status = -EINVAL; goto out; } rec = &new_el->l_recs[1]; @@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &bh); if (ret) { mlog_exit(ret); goto out; @@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, return 0; } - path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, BUG_ON(num_bits > clusters_to_add); - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), - &last_eb_bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } - rightmost_el = &eb->h_list; } else rightmost_el = path_root_el(path); @@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode, if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + left_path = ocfs2_new_path_from_et(et); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), - &last_eb_bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, } if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode, ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -5255,6 +5348,78 @@ out: return ret; } +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(inode, et, -len); + + ret = ocfs2_journal_dirty(handle, et->et_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; tl_count = le16_to_cpu(tl->tl_count); mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || tl_count == 0, @@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, while (i >= 0) { /* Caller has given us at least enough credits to * update the truncate log dinode */ - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) BUG_ON(mutex_trylock(&tl_inode->i_mutex)); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto out; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; num_to_flush = le16_to_cpu(tl->tl_used); mlog(0, "Flush %u records from truncate log #%llu\n", num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); @@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; if (le16_to_cpu(tl->tl_used)) { mlog(0, "We'll have %u logs to recover\n", le16_to_cpu(tl->tl_used)); @@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, * tl_used. */ tl->tl_used = 0; + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); status = ocfs2_write_block(osb, tl_bh, tl_inode); if (status < 0) { mlog_errno(status); @@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) */ /* - * Describes a single block free from a suballocator + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. */ struct ocfs2_cached_block_free { struct ocfs2_cached_block_free *free_next; @@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list { struct ocfs2_cached_block_free *f_first; }; -static int ocfs2_free_cached_items(struct ocfs2_super *osb, - int sysfile_type, - int slot, - struct ocfs2_cached_block_free *head) +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) { int ret; u64 bg_blkno; @@ -5893,6 +6062,82 @@ out: return ret; } +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kmalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + mlog(0, "Insert clusters: (bit %u, blk %llu)\n", + bit, (unsigned long long)blkno); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt) { @@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, if (fl->f_first) { mlog(0, "Free items: (type %u, slot %d)\n", fl->f_inode_type, fl->f_slot); - ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, - fl->f_slot, fl->f_first); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); if (ret2) mlog_errno(ret2); if (!ret) @@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, kfree(fl); } + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + return ret; } @@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode, eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } + + /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). + * Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); *new_last_eb = bh; get_bh(*new_last_eb); @@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } if (last_eb_bh) { - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - clusters_to_del; @@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { ret = ocfs2_jbd2_file_inode(handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - ret = walk_page_buffers(handle, page_buffers(page), - from, to, &partial, - ocfs2_journal_dirty_data); -#endif if (ret < 0) mlog_errno(ret); } @@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct page **pages = NULL; loff_t end = osb->s_clustersize; struct ocfs2_extent_tree et; + int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; @@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } } - handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; + if (vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &num); if (ret) { @@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + ocfs2_commit_trans(osb, handle); out_unlock: @@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_path(fe_bh, &di->id2.i_list); + path = ocfs2_new_path(fe_bh, &di->id2.i_list, + ocfs2_journal_access_di); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh); + status = ocfs2_read_extent_block(inode, + le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - - brelse(last_eb_bh); - status = -EIO; - goto bail; - } } (*tc)->tc_last_eb_bh = last_eb_bh; @@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, goto out; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 70257c84cfbe..cceff5c37f47 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -45,7 +45,9 @@ * * ocfs2_extent_tree contains info for the root of the b-tree, it must have a * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree - * functions. + * functions. With metadata ecc, we now call different journal_access + * functions for each type of metadata, so it must have the + * root_journal_access function. * ocfs2_extent_tree_operations abstract the normal operations we do for * the root of extent b-tree. */ @@ -54,6 +56,7 @@ struct ocfs2_extent_tree { struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; + ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; }; @@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh); +struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, - struct buffer_head *bh, - struct ocfs2_xattr_value_root *xv); + struct ocfs2_xattr_value_buf *vb); + +/* + * Read an extent block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The extent block will be validated + * with ocfs2_validate_extent_block(). + */ +int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, + struct buffer_head **bh); struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, @@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc); + int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_extent_tree *et); @@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); */ struct ocfs2_cached_dealloc_ctxt { struct ocfs2_per_slot_free_list *c_first_suballocator; + struct ocfs2_cached_block_free *c_global_allocator; }; static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) { c->c_first_suballocator = NULL; + c->c_global_allocator = NULL; +} +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit); +static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_global_allocator != NULL; } int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c22543b33420..a067a6cffb01 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/pipe_fs_i.h> #include <linux/mpage.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> @@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto bail; } fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature); - goto bail; - } - if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { mlog(ML_ERROR, "block offset is outside the allocated size: " @@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, if (ocfs2_should_order_data(inode)) { ret = ocfs2_jbd2_file_inode(handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif if (ret < 0) mlog_errno(ret); } @@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode, tmppage = wc->w_pages[i]; if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) { + if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - walk_page_buffers(wc->w_handle, - page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif - } block_commit_write(tmppage, from, to); } @@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; + if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { + ret = -EDQUOT; + goto out_commit; + } /* * We don't want this to fail in ocfs2_write_end(), so do it * here. */ - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } /* @@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mmap_page); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } if (data_ac) @@ -1790,6 +1776,10 @@ success: *pagep = wc->w_target_page; *fsdata = wc; return 0; +out_quota: + if (clusters_to_alloc) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); @@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) { + if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - walk_page_buffers(wc->w_handle, - page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif - } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c new file mode 100644 index 000000000000..2a947c44e594 --- /dev/null +++ b/fs/ocfs2/blockcheck.c @@ -0,0 +1,477 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.c + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2006, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/crc32.h> +#include <linux/buffer_head.h> +#include <linux/bitops.h> +#include <asm/byteorder.h> + +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "blockcheck.h" + + +/* + * We use the following conventions: + * + * d = # data bits + * p = # parity bits + * c = # total code bits (d + p) + */ + + +/* + * Calculate the bit offset in the hamming code buffer based on the bit's + * offset in the data buffer. Since the hamming code reserves all + * power-of-two bits for parity, the data bit number and the code bit + * number are offest by all the parity bits beforehand. + * + * Recall that bit numbers in hamming code are 1-based. This function + * takes the 0-based data bit from the caller. + * + * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), + * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. + * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 + * in the code buffer. + * + * The caller can pass in *p if it wants to keep track of the most recent + * number of parity bits added. This allows the function to start the + * calculation at the last place. + */ +static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) +{ + unsigned int b, p = 0; + + /* + * Data bits are 0-based, but we're talking code bits, which + * are 1-based. + */ + b = i + 1; + + /* Use the cache if it is there */ + if (p_cache) + p = *p_cache; + b += p; + + /* + * For every power of two below our bit number, bump our bit. + * + * We compare with (b + 1) because we have to compare with what b + * would be _if_ it were bumped up by the parity bit. Capice? + * + * p is set above. + */ + for (; (1 << p) < (b + 1); p++) + b++; + + if (p_cache) + *p_cache = p; + + return b; +} + +/* + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) +{ + unsigned int i, b, p = 0; + + BUG_ON(!d); + + /* + * b is the hamming code bit number. Hamming code specifies a + * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is + * for the algorithm. + * + * The i++ in the for loop is so that the start offset passed + * to ocfs2_find_next_bit_set() is one greater than the previously + * found bit. + */ + for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) + { + /* + * i is the offset in this hunk, nr + i is the total bit + * offset. + */ + b = calc_code_bit(nr + i, &p); + + /* + * Data bits in the resultant code are checked by + * parity bits that are part of the bit number + * representation. Huh? + * + * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code"> + * In other words, the parity bit at position 2^k + * checks bits in positions having bit k set in + * their binary representation. Conversely, for + * instance, bit 13, i.e. 1101(2), is checked by + * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. + * </wikipedia> + * + * Note that 'k' is the _code_ bit number. 'b' in + * our loop. + */ + parity ^= b; + } + + /* While the data buffer was treated as little endian, the + * return value is in host endian. */ + return parity; +} + +u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) +{ + return ocfs2_hamming_encode(0, data, blocksize * 8, 0); +} + +/* + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one hunk, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix) +{ + unsigned int i, b; + + BUG_ON(!d); + + /* + * If the bit to fix has an hweight of 1, it's a parity bit. One + * busted parity bit is its own error. Nothing to do here. + */ + if (hweight32(fix) == 1) + return; + + /* + * nr + d is the bit right past the data hunk we're looking at. + * If fix after that, nothing to do + */ + if (fix >= calc_code_bit(nr + d, NULL)) + return; + + /* + * nr is the offset in the data hunk we're starting at. Let's + * start b at the offset in the code buffer. See hamming_encode() + * for a more detailed description of 'b'. + */ + b = calc_code_bit(nr, NULL); + /* If the fix is before this hunk, nothing to do */ + if (fix < b) + return; + + for (i = 0; i < d; i++, b++) + { + /* Skip past parity bits */ + while (hweight32(b) == 1) + b++; + + /* + * i is the offset in this data hunk. + * nr + i is the offset in the total data buffer. + * b is the offset in the total code buffer. + * + * Thus, when b == fix, bit i in the current hunk needs + * fixing. + */ + if (b == fix) + { + if (ocfs2_test_bit(i, data)) + ocfs2_clear_bit(i, data); + else + ocfs2_set_bit(i, data); + break; + } + } +} + +void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix) +{ + ocfs2_hamming_fix(data, blocksize * 8, 0, fix); +} + +/* + * This function generates check information for a block. + * data is the block to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + u32 crc; + u32 ecc; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + crc = crc32_le(~0, data, blocksize); + ecc = ocfs2_hamming_encode_block(data, blocksize); + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHORT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information. Like _compute, + * the function will take care of zeroing bc before calculating check codes. + * If bc is not a pointer inside data, the caller must have zeroed any + * inline ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc; + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + ecc = ocfs2_hamming_encode_block(data, blocksize); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); + + /* And check the crc32 again */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * This function generates check information for a list of buffer_heads. + * bhs is the blocks to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i; + u32 crc, ecc; + + BUG_ON(nr < 0); + + if (!nr) + return; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHORT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information on a list of + * buffer_heads. Like _compute_bhs, the function will take care of + * zeroing bc before calculating check codes. If bc is not a pointer + * inside data, the caller must have zeroed any inline + * ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i, rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc, fix; + + BUG_ON(nr < 0); + + if (!nr) + return 0; + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + for (i = 0, ecc = 0; i < nr; i++) { + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + fix = ecc ^ check.bc_ecc; + for (i = 0; i < nr; i++) { + /* + * Try the fix against each buffer. It will only affect + * one of them. + */ + ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i, fix); + } + + /* And check the crc32 again */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * These are the main API. They check the superblock flag before + * calling the underlying operations. + * + * They expect the buffer(s) to be in disk format. + */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute(data, sb->s_blocksize, bc); +} + +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + int rc = 0; + + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc); + + return rc; +} + +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute_bhs(bhs, nr, bc); +} + +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int rc = 0; + + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc); + + return rc; +} + diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h new file mode 100644 index 000000000000..70ec3feda32f --- /dev/null +++ b/fs/ocfs2/blockcheck.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.h + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_BLOCKCHECK_H +#define OCFS2_BLOCKCHECK_H + + +/* High level block API */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* Lower level API */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* + * Hamming code functions + */ + +/* + * Encoding hamming code parity bits for a buffer. + * + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, + unsigned int nr); +/* + * Fix a buffer with a bit error. The 'fix' is the original parity + * xor'd with the parity calculated now. + * + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one buffer, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix); + +/* Convenience wrappers for a single buffer of data */ +extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); +extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix); +#endif diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 3a178ec48d7c..15c8e6deee2e 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -39,6 +39,18 @@ #include "buffer_head_io.h" +/* + * Bits on bh->b_state used by ocfs2. + * + * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. + */ +enum ocfs2_state_bits { + BH_NeedsValidate = BH_JBDPrivateStart, +}; + +/* Expand the magic b_state functions */ +BUFFER_FNS(NeedsValidate, needs_validate); + int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode) { @@ -166,7 +178,9 @@ bail: } int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, - struct buffer_head *bhs[], int flags) + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) { int status = 0; int i, ignore_cache = 0; @@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ + if (validate) + set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); continue; @@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, bhs[i] = NULL; continue; } + + if (buffer_needs_validate(bh)) { + /* We never set NeedsValidate if the + * buffer was held by the journal, so + * that better not have changed */ + BUG_ON(buffer_jbd(bh)); + clear_buffer_needs_validate(bh); + status = validate(inode->i_sb, bh); + if (status) { + put_bh(bh); + bhs[i] = NULL; + continue; + } + } } /* Always set the buffer in the cache, even if it was diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 75e1dcb1ade7..c75d682dadd8 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,21 +31,24 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct inode *inode, - u64 off, - struct buffer_head **bh); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct inode *inode, - u64 block, - int nr, - struct buffer_head *bhs[], - int flags); int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]); +/* + * If not NULL, validate() will be called on a buffer that is freshly + * read from disk. It will not be called if the buffer was in cache. + * Note that if validate() is being used for this buffer, it needs to + * be set even for a READAHEAD call, as it marks the buffer for later + * validation. + */ +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); + int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); @@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, #define OCFS2_BH_READAHEAD 8 static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh) + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) { int status = 0; @@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off, goto bail; } - status = ocfs2_read_blocks(inode, off, 1, bh, 0); + status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); bail: return status; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6ebaa58e2c03..04697ba7f73e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -854,7 +854,7 @@ static int o2hb_thread(void *data) while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside - * o2hb_do_disk_heartbeat so that we avoid more then + * o2hb_do_disk_heartbeat so that we avoid more than * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index d8a0cb92cef6..96df5416993e 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(QUORUM), define_mask(EXPORT), define_mask(XATTR), + define_mask(QUOTA), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 57670c680471..7e72a81bc2d4 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -113,6 +113,7 @@ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ +#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 026e6eb85187..f2c4098cf337 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -40,6 +40,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -47,6 +48,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dir.h" #include "dlmglue.h" #include "extent_map.h" @@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); -static struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) +/* + * These are distinct checks because future versions of the file system will + * want to have a trailing dirent structure independent of indexing. + */ +static int ocfs2_dir_has_trailer(struct inode *dir) { - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = 0; + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; - if (reada) - readflags |= OCFS2_BH_READAHEAD; + return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); +} - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } +static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) +{ + return ocfs2_meta_ecc(osb); +} - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } +static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) +{ + return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); +} - tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); - if (tmperr < 0) - goto fail; +#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) - tmperr = 0; +/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make + * them more consistent? */ +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data) +{ + char *p = data; - *err = 0; - return bh; + p += blocksize - sizeof(struct ocfs2_dir_block_trailer); + return (struct ocfs2_dir_block_trailer *)p; +} -fail: - brelse(bh); - bh = NULL; +/* + * XXX: This is executed once on every dirent. We should consider optimizing + * it. + */ +static int ocfs2_skip_dir_trailer(struct inode *dir, + struct ocfs2_dir_entry *de, + unsigned long offset, + unsigned long blklen) +{ + unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); - *err = -EIO; - return NULL; + if (!ocfs2_dir_has_trailer(dir)) + return 0; + + if (offset != toff) + return 0; + + return 1; +} + +static void ocfs2_init_dir_trailer(struct inode *inode, + struct buffer_head *bh) +{ + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); + strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); + trailer->db_compat_rec_len = + cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); + trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + trailer->db_blkno = cpu_to_le64(bh->b_blocknr); } /* @@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -250,6 +277,108 @@ out: return NULL; } +static int ocfs2_validate_dir_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(bh, sb); + + + /* + * We don't validate dirents here, that's handled + * in-place when the code walks them. + */ + mlog(0, "Validating dirblock %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + * + * Note that we are safe to call this even if the directory + * doesn't have a trailer. Filesystems without metaecc will do + * nothing, and filesystems with it will have one. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); + if (rc) + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + return rc; +} + +/* + * This function forces all errors to -EIO for consistency with its + * predecessor, ocfs2_bread(). We haven't audited what returning the + * real error codes would do to callers. We log the real codes with + * mlog_errno() before we squash them. + */ +static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, int flags) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + struct ocfs2_dir_block_trailer *trailer; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, + ocfs2_validate_dir_block); + if (rc) { + mlog_errno(rc); + goto out; + } + + /* + * We check the trailer here rather than in + * ocfs2_validate_dir_block() because that function doesn't have + * the inode to test. + */ + if (!(flags & OCFS2_BH_READAHEAD) && + ocfs2_dir_has_trailer(inode)) { + trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)tmp->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)tmp->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(inode)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)tmp->b_blocknr, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + } + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc ? -EIO : 0; +} + static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) @@ -296,15 +425,17 @@ restart: } num++; - bh = ocfs2_bread(dir, b++, &err, 1); + bh = NULL; + err = ocfs2_read_dir_block(dir, b++, &bh, + OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - if (ocfs2_read_block(dir, block, &bh)) { + if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. - * ocfs2_read_block() has released the bh. */ + * ocfs2_read_dir_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle, struct inode *new_entry_inode) { int ret; + ocfs2_journal_access_func access = ocfs2_journal_access_db; /* * The same code works fine for both inline-data and extent - * based directories, so no need to split this up. + * based directories, so no need to split this up. The only + * difference is the journal_access function. */ - ret = ocfs2_journal_access(handle, dir, de_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, { struct ocfs2_dir_entry *de, *pde; int i, status = -ENOENT; + ocfs2_journal_access_func access = ocfs2_journal_access_db; mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + i = 0; pde = NULL; de = (struct ocfs2_dir_entry *) first_de; @@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, goto bail; } if (de == de_del) { - status = ocfs2_journal_access(handle, dir, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = access(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { status = -EIO; mlog_errno(status); @@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle, goto bail; } + /* We're guaranteed that we should have space, so we + * can't possibly have hit the trailer...right? */ + mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), + "Hit dir trailer trying to insert %.*s " + "(namelen %d) into directory %llu. " + "offset is %lu, trailer offset is %d\n", + namelen, name, namelen, + (unsigned long long)parent_fe_bh->b_blocknr, + offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); + if (ocfs2_dirent_would_fit(de, rec_len)) { dir->i_mtime = dir->i_ctime = CURRENT_TIME; retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); @@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, dir, insert_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + if (insert_bh == parent_fe_bh) + status = ocfs2_journal_access_di(handle, dir, + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + else + status = ocfs2_journal_access_db(handle, dir, + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); /* By now the buffer is marked for journaling */ offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { @@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle, retval = 0; goto bail; } + offset += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); } @@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, int i, stored; struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; - int err; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; @@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, while (!error && !stored && *f_pos < i_size_read(inode)) { blk = (*f_pos) >> sb->s_blocksize_bits; - bh = ocfs2_bread(inode, blk, &err, 0); - if (!bh) { - mlog(ML_ERROR, - "directory #%llu contains a hole at offset %lld\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - *f_pos); + if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { + /* Skip the corrupt dirblock and keep trying */ *f_pos += sb->s_blocksize - offset; continue; } @@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { - tmp = ocfs2_bread(inode, ++blk, &err, 1); - brelse(tmp); + tmp = NULL; + if (!ocfs2_read_dir_block(inode, ++blk, &tmp, + OCFS2_BH_READAHEAD)) + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -828,6 +981,7 @@ revalidate: } offset = 0; brelse(bh); + bh = NULL; } stored = 0; @@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode) return !priv.seen_other; } -static void ocfs2_fill_initial_dirents(struct inode *inode, - struct inode *parent, - char *start, unsigned int size) +/* + * Fills "." and ".." dirents in a new directory block. Returns dirent for + * "..", which might be used during creation of a directory with a trailing + * header. It is otherwise safe to ignore the return code. + */ +static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, + struct inode *parent, + char *start, + unsigned int size) { struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; @@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode, de->name_len = 2; strcpy(de->name, ".."); ocfs2_set_de_type(de, S_IFDIR); + + return de; } /* @@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, struct ocfs2_inline_data *data = &di->id2.i_data; unsigned int size = le16_to_cpu(data->id_count); - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac) { int status; + unsigned int size = osb->sb->s_blocksize; struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de; mlog_entry_void(); + if (ocfs2_supports_dir_trailer(osb)) + size = ocfs2_dir_trailer_blk_off(parent->i_sb); + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, data_ac, NULL, &new_bh); if (status < 0) { @@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_set_new_buffer_uptodate(inode, new_bh); - status = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_db(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, osb->sb->s_blocksize); - ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, - osb->sb->s_blocksize); + de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); + if (ocfs2_supports_dir_trailer(osb)) + ocfs2_init_dir_trailer(inode, new_bh); status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { @@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, data_ac); } +/* + * Expand rec_len of the rightmost dirent in a directory block so that it + * contains the end of our valid space for dirents. We do this during + * expansion from an inline directory to one with extents. The first dir block + * in that case is taken from the inline data portion of the inode block. + * + * We add the dir trailer if this filesystem wants it. + */ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, - unsigned int new_size) + struct super_block *sb) { struct ocfs2_dir_entry *de; struct ocfs2_dir_entry *prev_de; char *de_buf, *limit; - unsigned int bytes = new_size - old_size; + unsigned int new_size = sb->s_blocksize; + unsigned int bytes; + + if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + new_size = ocfs2_dir_trailer_blk_off(sb); + + bytes = new_size - old_size; limit = start + old_size; de_buf = start; @@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, unsigned int blocks_wanted, struct buffer_head **first_block_bh) { - int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS; u32 alloc, bit_off, len; struct super_block *sb = dir->i_sb; + int ret, credits = ocfs2_inline_to_extents_credits(sb); u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); @@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; + int did_quota = 0; ocfs2_init_dinode_extent_tree(&et, dir, di_bh); @@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_sem; } + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; /* * Try to claim as many clusters as the bitmap can give though * if we only get one now, that's enough to continue. The rest @@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); - ret = ocfs2_journal_access(handle, dir, dirdata_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); memset(dirdata_bh->b_data + i_size_read(dir), 0, sb->s_blocksize - i_size_read(dir)); - ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), - sb->s_blocksize); + ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); + if (ocfs2_supports_dir_trailer(osb)) + ocfs2_init_dir_trailer(dir, dirdata_bh); ret = ocfs2_journal_dirty(handle, dirdata_bh); if (ret) { @@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We let the later dirent insert modify c/mtime - to the user * the data hasn't changed. */ - ret = ocfs2_journal_access(handle, dir, di_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, dirdata_bh = NULL; out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 2)); ocfs2_commit_trans(osb, handle); out_sem: @@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct buffer_head **new_bh) { int status; - int extend; + int extend, did_quota = 0; u64 p_blkno, v_blkno; spin_lock(&OCFS2_I(dir)->ip_lock); @@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1))) { + status = -EDQUOT; + goto bail; + } + did_quota = 1; + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 1, 0, parent_fe_bh, handle, data_ac, meta_ac, NULL); @@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb, } status = 0; bail: + if (did_quota && status < 0) + vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -1569,16 +1771,22 @@ do_extend: ocfs2_set_new_buffer_uptodate(dir, new_bh); - status = ocfs2_journal_access(handle, dir, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_db(handle, dir, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, sb->s_blocksize); + de = (struct ocfs2_dir_entry *) new_bh->b_data; de->inode = 0; - de->rec_len = cpu_to_le16(sb->s_blocksize); + if (ocfs2_dir_has_trailer(dir)) { + de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); + ocfs2_init_dir_trailer(dir, new_bh); + } else { + de->rec_len = cpu_to_le16(sb->s_blocksize); + } status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { mlog_errno(status); @@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, unsigned int *blocks_wanted) { int ret; + struct super_block *sb = dir->i_sb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_dir_entry *de, *last_de = NULL; char *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len; + unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + + /* + * This calculates how many free bytes we'd have in block zero, should + * this function force expansion to an extent tree. + */ + if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); + else + free_space = dir->i_sb->s_blocksize - i_size_read(dir); de_buf = di->id2.i_data.id_data; limit = de_buf + i_size_read(dir); @@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, ret = -EEXIST; goto out; } + /* + * No need to check for a trailing dirent record here as + * they're not used for inline dirs. + */ + if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. */ @@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, * dirent can be found. */ *blocks_wanted = 1; - new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); + new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) *blocks_wanted = 2; @@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, struct ocfs2_dir_entry *de; struct super_block *sb = dir->i_sb; int status; + int blocksize = dir->i_sb->s_blocksize; - bh = ocfs2_bread(dir, 0, &status, 0); - if (!bh) { + status = ocfs2_read_dir_block(dir, 0, &bh, 0); + if (status) { mlog_errno(status); goto bail; } @@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = -ENOSPC; goto bail; } - bh = ocfs2_bread(dir, - offset >> sb->s_blocksize_bits, - &status, - 0); - if (!bh) { + status = ocfs2_read_dir_block(dir, + offset >> sb->s_blocksize_bits, + &bh, 0); + if (status) { mlog_errno(status); goto bail; } @@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = -EEXIST; goto bail; } + + if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, + blocksize)) + goto next; + if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. */ @@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; goto bail; } +next: offset += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); } diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index ce48b9080d87..c511e2e18e9f 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_alloc_context *data_ac); +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data); #endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 644bee55d8ba..d07ddbe4b283 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct list_head *iter, *head=NULL; u64 cookie; u32 flags; + u8 node; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, name = past->name; locklen = past->namelen; - cookie = be64_to_cpu(past->cookie); + cookie = past->cookie; flags = be32_to_cpu(past->flags); + node = past->node_idx; if (locklen > DLM_LOCKID_NAME_MAX) { ret = DLM_IVBUFLEN; - mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); + mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " + "handler!\n", locklen); goto leave; } if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == (LKM_PUT_LVB|LKM_GET_LVB)) { - mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", + flags); ret = DLM_BADARGS; goto leave; } @@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, if (past->type != DLM_AST && past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" - "name=%.*s\n", past->type, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name); + "name=%.*s, node=%u\n", past->type, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_IVLOCKID; goto leave; } res = dlm_lookup_lockres(dlm, name, locklen); if (!res) { - mlog(0, "got %sast for unknown lockres! " - "cookie=%u:%llu, name=%.*s, namelen=%u\n", - past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name, locklen); + mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " + "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_IVLOCKID; goto leave; } @@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { - mlog(0, "responding with DLM_RECOVERING!\n"); + mlog(0, "Responding with DLM_RECOVERING!\n"); ret = DLM_RECOVERING; goto unlock_out; } if (res->state & DLM_LOCK_RES_MIGRATING) { - mlog(0, "responding with DLM_MIGRATING!\n"); + mlog(0, "Responding with DLM_MIGRATING!\n"); ret = DLM_MIGRATING; goto unlock_out; } @@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, lock = NULL; list_for_each(iter, head) { lock = list_entry (iter, struct dlm_lock, list); - if (be64_to_cpu(lock->ml.cookie) == cookie) + if (lock->ml.cookie == cookie) goto do_ast; } @@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, list_for_each(iter, head) { lock = list_entry (iter, struct dlm_lock, list); - if (be64_to_cpu(lock->ml.cookie) == cookie) + if (lock->ml.cookie == cookie) goto do_ast; } - mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " - "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name, locklen); + mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " + "node=%u\n", past->type == DLM_AST ? "" : "b", + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_NORMAL; unlock_out: @@ -383,8 +386,8 @@ do_ast: if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); - mlog(0, "ast: adding to granted list... type=%d, " - "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + mlog(0, "ast: Adding to granted list... type=%d, " + "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; @@ -408,7 +411,6 @@ do_ast: dlm_do_local_bast(dlm, res, lock, past->blocked_type); leave: - if (res) dlm_lockres_put(res); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d5a86fb81a49..bb53714813ab 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -140,6 +140,7 @@ struct dlm_ctxt unsigned int purge_count; spinlock_t spinlock; spinlock_t ast_lock; + spinlock_t track_lock; char *name; u8 node_num; u32 key; @@ -316,6 +317,8 @@ struct dlm_lock_resource * put on a list for the dlm thread to run. */ unsigned long last_used; + struct dlm_ctxt *dlm; + unsigned migration_pending:1; atomic_t asts_reserved; spinlock_t spinlock; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 1b81dcba175d..b32f60a5acfb 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) { struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *oldres = dl->dl_res; struct dlm_lock_resource *res = NULL; + struct list_head *track_list; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); + if (oldres) + track_list = &oldres->tracking; + else + track_list = &dlm->tracking_list; - if (dl->dl_res) { - list_for_each_entry(res, &dl->dl_res->tracking, tracking) { - if (dl->dl_res) { - dlm_lockres_put(dl->dl_res); - dl->dl_res = NULL; - } - if (&res->tracking == &dlm->tracking_list) { - mlog(0, "End of list found, %p\n", res); - dl = NULL; - break; - } + list_for_each_entry(res, track_list, tracking) { + if (&res->tracking == &dlm->tracking_list) + res = NULL; + else dlm_lockres_get(res); - dl->dl_res = res; - break; - } - } else { - if (!list_empty(&dlm->tracking_list)) { - list_for_each_entry(res, &dlm->tracking_list, tracking) - break; - dlm_lockres_get(res); - dl->dl_res = res; - } else - dl = NULL; + break; } + spin_unlock(&dlm->track_lock); - if (dl) { - spin_lock(&dl->dl_res->spinlock); - dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); - spin_unlock(&dl->dl_res->spinlock); - } + if (oldres) + dlm_lockres_put(oldres); - spin_unlock(&dlm->spinlock); + dl->dl_res = res; + + if (res) { + spin_lock(&res->spinlock); + dump_lockres(res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&res->spinlock); + } else + dl = NULL; + /* passed to seq_show */ return dl; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 63f8125824e8..d8d578f45613 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); spin_lock_init(&dlm->ast_lock); + spin_lock_init(&dlm->track_lock); INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 6f7a77d54020..1c9efb406a96 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 44f87caf3683..54e182a27caf 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm, static void dlm_lockres_release(struct kref *kref) { struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; res = container_of(kref, struct dlm_lock_resource, refs); + dlm = res->dlm; /* This should not happen -- all lockres' have a name * associated with them at init time. */ @@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + spin_lock(&dlm->track_lock); if (!list_empty(&res->tracking)) list_del_init(&res->tracking); else { @@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref) res->lockname.len, res->lockname.name); dlm_print_one_lock_resource(res); } + spin_unlock(&dlm->track_lock); + + dlm_put(dlm); if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || @@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->migration_pending = 0; res->inflight_locks = 0; + /* put in dlm_lockres_release */ + dlm_grab(dlm); + res->dlm = dlm; + kref_init(&res->refs); /* just for consistency */ @@ -722,14 +732,21 @@ lookup: if (tmpres) { int dropping_ref = 0; + spin_unlock(&dlm->spinlock); + spin_lock(&tmpres->spinlock); + /* We wait for the other thread that is mastering the resource */ + if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + __dlm_wait_on_lockres(tmpres); + BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + } + if (tmpres->owner == dlm->node_num) { BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); dlm_lockres_grab_inflight_ref(dlm, tmpres); } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) dropping_ref = 1; spin_unlock(&tmpres->spinlock); - spin_unlock(&dlm->spinlock); /* wait until done messaging the master, drop our ref to allow * the lockres to be purged, start over. */ @@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, struct dlm_node_iter *iter) { struct dlm_migrate_request migrate; - int ret, status = 0; + int ret, skip, status = 0; int nodenum; memset(&migrate, 0, sizeof(migrate)); @@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, nodenum == new_master) continue; + /* We could race exit domain. If exited, skip. */ + spin_lock(&dlm->spinlock); + skip = (!test_bit(nodenum, dlm->domain_map)); + spin_unlock(&dlm->spinlock); + if (skip) { + clear_bit(nodenum, iter->node_map); + continue; + } + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, &migrate, sizeof(migrate), nodenum, &status); - if (ret < 0) - mlog_errno(ret); - else if (status < 0) { + if (ret < 0) { + mlog(0, "migrate_request returned %d!\n", ret); + if (!dlm_is_host_down(ret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", ret); + BUG(); + } + clear_bit(nodenum, iter->node_map); + ret = 0; + } else if (status < 0) { mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 4060bb328bc8..d1295203029f 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); /* This ensures that clear refmap is sent after the set */ - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | + DLM_LOCK_RES_MIGRATING)); spin_unlock(&res->spinlock); /* clear our bit from the master's refmap, ignore errors */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e6cc0a2e5f7..b0c4cadd4c45 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -32,6 +32,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -51,6 +52,7 @@ #include "slot_map.h" #include "super.h" #include "uptodate.h" +#include "quota.h" #include "buffer_head_io.h" @@ -68,6 +70,7 @@ struct ocfs2_mask_waiter { static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); /* * Return value from ->downconvert_worker functions. @@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) @@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level, unsigned int line, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = - (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); mlog(level, "LVB information for %s (called from %s:%u):\n", lockres->l_name, function, line); @@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { + .set_lvb = ocfs2_set_qinfo_lvb, + .get_osb = ocfs2_get_qinfo_osb, + .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res return (struct ocfs2_dentry_lock *)lockres->l_priv; } +static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); + + return (struct ocfs2_mem_dqinfo *)lockres->l_priv; +} + static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { if (lockres->l_ops->get_osb) @@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) return OCFS2_SB(inode->i_sb); } +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mem_dqinfo *info = lockres->l_priv; + + return OCFS2_SB(info->dqi_gi.dqi_sb); +} + static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) { struct ocfs2_file_private *fp = lockres->l_priv; @@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, lockres->l_flags |= OCFS2_LOCK_NOCACHE; } +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, + 0, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, + OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, + info); +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -1290,7 +1324,7 @@ again: goto out; } - mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", + mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", lockres->l_name); /* At this point we've gone inside the dlm and need to @@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) mlog_entry_void(); - lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); /* * Invalidate the LVB of a deleted inode - this way other @@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_meta_lvb(0, lockres); - lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); /* We're safe here without the lockres lock... */ spin_lock(&oi->ip_lock); @@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = - (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); if (lvb->lvb_version == OCFS2_LVB_VERSION && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) @@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(inode, oi->ip_blkno, bh); + status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode, fe = (struct ocfs2_dinode *) (*bh)->b_data; /* This is a good chance to make sure we're not - * locking an invalid object. + * locking an invalid object. ocfs2_read_inode_block() + * already checked that the inode block is sane. * * We bug on a stale inode here because we checked * above whether it was wiped from disk. The wiping * node provides a guarantee that we receive that * message and can mark the inode before dropping any * locks associated with it. */ - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail_refresh; - } mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation), "Invalid dinode %llu disk generation: %u " @@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); + status = ocfs2_read_inode_block(inode, ret_bh); if (status < 0) mlog_errno(status); @@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, ocfs2_dlm_dump_lksb(&lockres->l_lksb); BUG(); } - mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", + mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", lockres->l_name); ocfs2_wait_on_busy_lock(lockres); @@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_qinfo_lvb *lvb; + struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + + mlog_entry_void(); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; + lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); + lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); + lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); + lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); + lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); + lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); + + mlog_exit_void(); +} + +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + mlog_entry_void(); + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); + mlog_exit_void(); +} + +static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) +{ + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + struct buffer_head *bh = NULL; + struct ocfs2_global_disk_dqinfo *gdinfo; + int status = 0; + + if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { + info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); + info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); + oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); + oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); + oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); + oinfo->dqi_gi.dqi_free_entry = + be32_to_cpu(lvb->lvb_free_entry); + } else { + status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); + if (status) { + mlog_errno(status); + goto bail; + } + gdinfo = (struct ocfs2_global_disk_dqinfo *) + (bh->b_data + OCFS2_GLOBAL_INFO_OFF); + info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); + info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = + le32_to_cpu(gdinfo->dqi_free_entry); + brelse(bh); + ocfs2_track_lock_refresh(lockres); + } + +bail: + return status; +} + +/* Lock quota info, this function expects at least shared lock on the quota file + * so that we can safely refresh quota info from disk. */ +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + mlog_entry_void(); + + /* On RO devices, locking really isn't needed... */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + /* OK, we have the lock but we need to refresh the quota info */ + status = ocfs2_refresh_qinfo(oinfo); + if (status) + ocfs2_qinfo_unlock(oinfo, ex); + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + mlog_exit(status); + return status; +} + /* * This is the filesystem locking protocol. It provides the lock handling * hooks for the underlying DLM. It has a maximum version number. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 2bb01f09c1b1..3f8d9986b8e0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -49,6 +49,19 @@ struct ocfs2_meta_lvb { __be32 lvb_reserved2; }; +#define OCFS2_QINFO_LVB_VERSION 1 + +struct ocfs2_qinfo_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_bgrace; + __be32 lvb_igrace; + __be32 lvb_syncms; + __be32 lvb_blocks; + __be32 lvb_free_blk; + __be32 lvb_free_entry; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, struct ocfs2_file_private; void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp); +struct ocfs2_mem_dqinfo; +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); @@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); void ocfs2_file_unlock(struct file *file); +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); + void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2baedac58234..f2bb1a04d253 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); + ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ret = -EROFS; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - goto out; - } - if (el->l_tree_depth) { ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in " @@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(inode, - le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh); + ret = ocfs2_read_extent_block(inode, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; } - next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; - - if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { - ret = -EROFS; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); - goto out; - } + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; el = &next_eb->h_list; - i = ocfs2_search_for_hole_index(el, v_cluster); } @@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, if (ret == 0) goto out; - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -819,3 +806,74 @@ out: return ret; } + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " + "flags = %x, validate = %p)\n", + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } + + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } + + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, + flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + mlog_exit(rc); + return rc; +} + + diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 1c4aa8b06f34..b7dd9731b462 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, struct ocfs2_extent_list *el); +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); +static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); + +bail: + return status; +} + + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e2570a3bc2b2..a5887df2cd8a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -35,6 +35,7 @@ #include <linux/mount.h> #include <linux/writeback.h> #include <linux/falloc.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -56,6 +57,8 @@ #include "suballoc.h" #include "super.h" #include "xattr.h" +#include "acl.h" +#include "quota.h" #include "buffer_head_io.h" @@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode, goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -303,9 +306,9 @@ bail: return status; } -static int ocfs2_simple_size_update(struct inode *inode, - struct buffer_head *di_bh, - u64 new_i_size) +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ fe = (struct ocfs2_dinode *) di_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " @@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; + int did_quota = 0; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto leave; } - fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto leave; - } restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); @@ -585,11 +580,18 @@ restart_all: } restarted_transaction: + if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, + clusters_to_add))) { + status = -EDQUOT; + goto leave; + } + did_quota = 1; + /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -622,6 +624,10 @@ restarted_transaction: spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; if (why != RESTART_NONE && clusters_to_add) { if (why == RESTART_META) { @@ -654,6 +660,9 @@ restarted_transaction: OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); leave: + if (status < 0 && did_quota) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); handle = NULL; @@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; + int locked[MAXQUOTAS] = {0, 0}; + int credits, qtype; + struct ocfs2_mem_dqinfo *oinfo; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + credits = OCFS2_INODE_UPDATE_CREDITS; + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + + ocfs2_calc_qdel_credits(sb, USRQUOTA); + locked[USRQUOTA] = 1; + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + + ocfs2_calc_qdel_credits(sb, GRPQUOTA); + locked[GRPQUOTA] = 1; + } + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } } /* @@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + if (!locked[qtype]) + continue; + oinfo = sb_dqinfo(sb, qtype)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 1); + } ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) @@ -989,6 +1043,12 @@ bail_unlock_rw: bail: brelse(bh); + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } + mlog_exit(status); return status; } @@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask) goto out; } - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, ocfs2_check_acl); ocfs2_inode_unlock(inode, 0); out: @@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode, goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_trans; @@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode) { int ret; struct buffer_head *bh = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); + ret = ocfs2_read_inode_block(inode, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, - &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1226,83 +1284,6 @@ out: return ret; } -static int __ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) -{ - int ret; - u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - handle_t *handle; - struct ocfs2_alloc_context *meta_ac = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_tree et; - - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); - - ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); - if (ret) { - mlog_errno(ret); - return ret; - } - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, - dealloc); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - OCFS2_I(inode)->ip_clusters -= len; - di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); - if (ret) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); -out: - mutex_unlock(&tl_inode->i_mutex); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - - return ret; -} - /* * Truncate a byte range, avoiding pages within partial clusters. This * preserves those pages for the zeroing code to write to. @@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); ocfs2_init_dealloc_ctxt(&dealloc); if (byte_len == 0) @@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, /* Only do work for non-holes */ if (phys_cpos != 0) { - ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, - phys_cpos, alloc_size, - &dealloc); + ret = ocfs2_remove_btree_range(inode, &et, cpos, + phys_cpos, alloc_size, + &dealloc); if (ret) { mlog_errno(ret); goto out; @@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr) { struct inode *inode = file->f_path.dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e92382cbca5f..172f9fbc9fc7 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7aa00d511874..229e707bc050 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <asm/byteorder.h> @@ -37,6 +38,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "file.h" @@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) return 0; } -int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, - int create_ino) +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) { struct super_block *sb; struct ocfs2_super *osb; - int status = -EINVAL; int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, @@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) use_plocks = 0; - /* this means that read_inode cannot create a superblock inode - * today. change if needed. */ - if (!OCFS2_IS_VALID_DINODE(fe) || - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " - "signature = %.*s, flags = 0x%x\n", - inode->i_ino, - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature, le32_to_cpu(fe->i_flags)); - goto bail; - } + /* + * These have all been checked by ocfs2_read_inode_block() or set + * by ocfs2_mknod_locked(), so a failure is a code bug. + */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode + cannot create a superblock + inode today. change if + that is needed. */ + BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); + BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); - if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { - mlog(ML_ERROR, "file entry generation does not match " - "superblock! osb->fs_generation=%x, " - "fe->i_fs_generation=%x\n", - osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); - goto bail; - } OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); @@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_nlink = le16_to_cpu(fe->i_links_count); - if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + inode->i_flags |= S_NOQUOTA; + } if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { + inode->i_flags |= S_NOQUOTA; } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); /* we can't actually hit this as read_inode can't @@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_set_inode_flags(inode); - status = 0; -bail: - mlog_exit(status); - return status; + mlog_exit_void(); } static int ocfs2_read_locked_inode(struct inode *inode, @@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - if (can_lock) - status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE); - else + if (can_lock) { + status = ocfs2_read_inode_block_full(inode, &bh, + OCFS2_BH_IGNORE_CACHE); + } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); + if (!status) + status = ocfs2_validate_inode_block(osb->sb, bh); + } if (status < 0) { mlog_errno(status); goto bail; @@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode, status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(0, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)args->fi_blkno, 7, - fe->i_signature); - goto bail; - } /* * This is a code bug. Right now the caller needs to @@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode, if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) - inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); - if (ocfs2_populate_inode(inode, fe, 0) < 0) - goto bail; + ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); @@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out; @@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode, } /* set the inodes dtime */ - status = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail_commit; @@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(inode, di_bh); + vfs_dq_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); @@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode) mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - if (is_bad_inode(inode)) { + /* When we fail in read_inode() we mark inode as bad. The second test + * catches the case when inode allocation fails before allocating + * a block for inode. */ + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { mlog(0, "Skipping delete of bad inode\n"); goto bail; } @@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, mlog_entry("(inode %llu)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode, spin_unlock(&OCFS2_I(inode)->ip_lock); } + +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + mlog(0, "Validating dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + /* + * Errors after here are fatal. + */ + + rc = -EINVAL; + + if (!OCFS2_IS_VALID_DINODE(di)) { + ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); + goto bail; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + goto bail; + } + + rc = 0; + +bail: + return rc; +} + +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, + flags, ocfs2_validate_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) +{ + return ocfs2_read_inode_block_full(inode, bh, 0); +} diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 2f37af9bcc4a..eb3c302b38d3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); -int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, - int create_ino); +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); void ocfs2_read_inode(struct inode *inode); void ocfs2_read_inode2(struct inode *inode, void *opaque); ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, @@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, struct buffer_head *bh); int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada); void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); @@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); } +/* Validate that a bh contains a valid inode */ +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +/* + * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. + * This is a cached read. The inode will be validated with + * ocfs2_validate_inode_block(). + */ +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); +/* The same, but can be passed OCFS2_BH_* flags */ +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags); #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 99fe9d584f3c..57d7d25a2b9a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -35,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dir.h" #include "dlmglue.h" #include "extent_map.h" @@ -45,6 +46,7 @@ #include "slot_map.h" #include "super.h" #include "sysfile.h" +#include "quota.h" #include "buffer_head_io.h" @@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock); static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, - int node_num); + int node_num, int slot_num); static int __ocfs2_recovery_thread(void *arg); static int ocfs2_commit_cache(struct ocfs2_super *osb); -static int ocfs2_wait_on_mount(struct ocfs2_super *osb); +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, int dirty, int replayed); static int ocfs2_trylock_journal(struct ocfs2_super *osb, @@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); +static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 0); +} + +static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 1); +} + + /* * The recovery_list is a simple linked list of node numbers to recover. @@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); BUG_ON(max_buffs <= 0); - /* JBD might support this, but our journalling code doesn't yet. */ - if (journal_current_handle()) { - mlog(ML_ERROR, "Recursive transaction attempted!\n"); - BUG(); - } + /* Nested transaction? Just return the handle... */ + if (journal_current_handle()) + return jbd2_journal_start(journal, max_buffs); down_read(&osb->journal->j_trans_barrier); @@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle) { - int ret; + int ret, nested; struct ocfs2_journal *journal = osb->journal; BUG_ON(!handle); + nested = handle->h_ref > 1; ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); - up_read(&journal->j_trans_barrier); + if (!nested) + up_read(&journal->j_trans_barrier); return ret; } @@ -357,10 +370,137 @@ bail: return status; } -int ocfs2_journal_access(handle_t *handle, - struct inode *inode, - struct buffer_head *bh, - int type) +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; +}; + +static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) +{ + return container_of(triggers, struct ocfs2_triggers, ot_triggers); +} + +static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, data + ot->ot_offset); +} + +/* + * Quota blocks have their own trigger because the struct ocfs2_block_check + * offset depends on the blocksize. + */ +static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &dqt->dq_check); +} + +/* + * Directory blocks also have their own trigger because the + * struct ocfs2_block_check offset depends on the blocksize. + */ +static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_dir_block_trailer *trailer = + ocfs2_dir_trailer_from_size(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &trailer->db_check); +} + +static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh) +{ + mlog(ML_ERROR, + "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " + "bh->b_blocknr = %llu\n", + (unsigned long)bh, + (unsigned long long)bh->b_blocknr); + + /* We aren't guaranteed to have the superblock here - but if we + * don't, it'll just crash. */ + ocfs2_error(bh->b_assoc_map->host->i_sb, + "JBD2 has aborted our journal, ocfs2 cannot continue\n"); +} + +static struct ocfs2_triggers di_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dinode, i_check), +}; + +static struct ocfs2_triggers eb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_extent_block, h_check), +}; + +static struct ocfs2_triggers gd_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), +}; + +static struct ocfs2_triggers db_triggers = { + .ot_triggers = { + .t_commit = ocfs2_db_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers xb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), +}; + +static struct ocfs2_triggers dq_triggers = { + .ot_triggers = { + .t_commit = ocfs2_dq_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static int __ocfs2_journal_access(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_triggers *triggers, + int type) { int status; @@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } + if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) + jbd2_journal_set_triggers(bh, &triggers->ot_triggers); mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); if (status < 0) @@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle, return status; } +int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &di_triggers, + type); +} + +int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, + type); +} + +int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, + type); +} + +int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &db_triggers, + type); +} + +int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, + type); +} + +int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, + type); +} + +int ocfs2_journal_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, NULL, type); +} + int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) { @@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } -#ifdef CONFIG_OCFS2_COMPAT_JBD -int ocfs2_journal_dirty_data(handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_dirty_data(handle, bh); - if (err) - mlog_errno(err); - /* TODO: When we can handle it, abort the handle and go RO on - * error here. */ - - return err; -} -#endif - #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) @@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, mlog_entry_void(); fe = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - /* This is called from startup/shutdown which will - * handle the errors in a specific manner, so no need - * to call ocfs2_error() here. */ - mlog(ML_ERROR, "Journal dinode %llu has invalid " - "signature: %.*s", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature); - status = -EIO; - goto out; - } + + /* The journal bh on the osb always comes from ocfs2_journal_init() + * and was validated there inside ocfs2_inode_lock_full(). It's a + * code bug if we mess it up. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); flags = le32_to_cpu(fe->id1.journal1.ij_flags); if (dirty) @@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, if (replayed) ocfs2_bump_recovery_generation(fe); + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); status = ocfs2_write_block(osb, bh, journal->j_inode); if (status < 0) mlog_errno(status); -out: mlog_exit(status); return status; } @@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item { int lri_slot; struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; + struct ocfs2_quota_recovery *lri_qrec; }; /* Does the second half of the recovery process. By this point, the @@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_super *osb = journal->j_osb; struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; + struct ocfs2_quota_recovery *qrec; LIST_HEAD(tmp_la_list); mlog_entry_void(); @@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work) mlog(0, "Complete recovery for slot %d\n", item->lri_slot); + ocfs2_wait_on_quotas(osb); + la_dinode = item->lri_la_dinode; if (la_dinode) { mlog(0, "Clean up local alloc %llu\n", @@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work) if (ret < 0) mlog_errno(ret); + qrec = item->lri_qrec; + if (qrec) { + mlog(0, "Recovering quota files"); + ret = ocfs2_finish_quota_recovery(osb, qrec, + item->lri_slot); + if (ret < 0) + mlog_errno(ret); + /* Recovery info is already freed now */ + } + kfree(item); } @@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work) static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, - struct ocfs2_dinode *tl_dinode) + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec) { struct ocfs2_la_recovery_item *item; @@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, if (tl_dinode) kfree(tl_dinode); + if (qrec) + ocfs2_free_quota_recovery(qrec); + mlog_errno(-ENOMEM); return; } @@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_la_dinode = la_dinode; item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; + item->lri_qrec = qrec; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_queue_recovery_completion(journal, osb->slot_num, osb->local_alloc_copy, + NULL, NULL); ocfs2_schedule_truncate_log_flush(osb, 0); @@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) } } +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) +{ + if (osb->quota_rec) { + ocfs2_queue_recovery_completion(osb->journal, + osb->slot_num, + NULL, + NULL, + osb->quota_rec); + osb->quota_rec = NULL; + } +} + static int __ocfs2_recovery_thread(void *arg) { - int status, node_num; + int status, node_num, slot_num; struct ocfs2_super *osb = arg; struct ocfs2_recovery_map *rm = osb->recovery_map; + int *rm_quota = NULL; + int rm_quota_used = 0, i; + struct ocfs2_quota_recovery *qrec; mlog_entry_void(); @@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg) goto bail; } + rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } restart: status = ocfs2_super_lock(osb, 1); if (status < 0) { @@ -1032,8 +1242,28 @@ restart: * clear it until ocfs2_recover_node() has succeeded. */ node_num = rm->rm_entries[0]; spin_unlock(&osb->osb_lock); - - status = ocfs2_recover_node(osb, node_num); + mlog(0, "checking node %d\n", node_num); + slot_num = ocfs2_node_num_to_slot(osb, node_num); + if (slot_num == -ENOENT) { + status = 0; + mlog(0, "no slot for this node, so no recovery" + "required.\n"); + goto skip_recovery; + } + mlog(0, "node %d was using slot %d\n", node_num, slot_num); + + /* It is a bit subtle with quota recovery. We cannot do it + * immediately because we have to obtain cluster locks from + * quota files and we also don't want to just skip it because + * then quota usage would be out of sync until some node takes + * the slot. So we remember which nodes need quota recovery + * and when everything else is done, we recover quotas. */ + for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + + status = ocfs2_recover_node(osb, node_num, slot_num); +skip_recovery: if (!status) { ocfs2_recovery_map_clear(osb, node_num); } else { @@ -1055,13 +1285,27 @@ restart: if (status < 0) mlog_errno(status); + /* Now it is right time to recover quotas... We have to do this under + * superblock lock so that noone can start using the slot (and crash) + * before we recover it */ + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], + NULL, NULL, qrec); + } + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead * node(s) may have disallowd a previos inode delete. Re-processing * is therefore required. */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL); + NULL, NULL); bail: mutex_lock(&osb->recovery_lock); @@ -1076,6 +1320,9 @@ bail: mutex_unlock(&osb->recovery_lock); + if (rm_quota) + kfree(rm_quota); + mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but @@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, - OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, osb->slot_recovery_generations[slot_num] = ocfs2_get_recovery_generation(fe); + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); status = ocfs2_write_block(osb, bh, inode); if (status < 0) mlog_errno(status); @@ -1304,31 +1551,19 @@ done: * far less concerning. */ static int ocfs2_recover_node(struct ocfs2_super *osb, - int node_num) + int node_num, int slot_num) { int status = 0; - int slot_num; struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; - mlog_entry("(node_num=%d, osb->node_num = %d)\n", - node_num, osb->node_num); - - mlog(0, "checking node %d\n", node_num); + mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", + node_num, slot_num, osb->node_num); /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. */ BUG_ON(osb->node_num == node_num); - slot_num = ocfs2_node_num_to_slot(osb, node_num); - if (slot_num == -ENOENT) { - status = 0; - mlog(0, "no slot for this node, so no recovery required.\n"); - goto done; - } - - mlog(0, "node %d was using slot %d\n", node_num, slot_num); - status = ocfs2_replay_journal(osb, node_num, slot_num); if (status < 0) { if (status == -EBUSY) { @@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy); + tl_copy, NULL); status = 0; done: @@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, return ret; } -static int ocfs2_wait_on_mount(struct ocfs2_super *osb) +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) { /* This check is good because ocfs2 will wait on our recovery * thread before changing it to something other than MOUNTED * or DISABLED. */ wait_event(osb->osb_mount_event, - atomic_read(&osb->vol_state) == VOLUME_MOUNTED || + (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || + atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || atomic_read(&osb->vol_state) == VOLUME_DISABLED); /* If there's an error on mount, then we may never get to the diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d4d14e9a3cea..3c3532e1307c 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,12 +27,7 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include <linux/jbd2.h> -#else -# include <linux/jbd.h> -# include "ocfs2_jbd_compat.h" -#endif +#include <linux/jbd2.h> enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { @@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_extend_trans - Extend a handle by nblocks credits. This may * commit the handle to disk in the process, but will * not release any locks taken during the transaction. - * ocfs2_journal_access - Notify the handle that we want to journal this + * ocfs2_journal_access* - Notify the handle that we want to journal this * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . + * Always call the specific flavor of + * ocfs2_journal_access_*() unless you intend to + * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before * the current handle commits. @@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks); #define OCFS2_JOURNAL_ACCESS_WRITE 1 #define OCFS2_JOURNAL_ACCESS_UNDO 2 -int ocfs2_journal_access(handle_t *handle, - struct inode *inode, - struct buffer_head *bh, - int type); + +/* ocfs2_inode */ +int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_extent_block */ +int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_group_desc */ +int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_xattr_block */ +int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* quota blocks */ +int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* dirblock */ +int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* Anything that has no ecc */ +int ocfs2_journal_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); + /* * A word about the journal_access/journal_dirty "dance". It is * entirely legal to journal_access a buffer more than once (as long @@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); -#ifdef CONFIG_OCFS2_COMPAT_JBD -int ocfs2_journal_dirty_data(handle_t *handle, - struct buffer_head *bh); -#endif /* * Credit Macros: @@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * The two writes below can accidentally see global info dirty due + * to set_info() quotactl so make them prepared for the writes. + */ +/* quota data block, global info */ +/* Write to local quota file */ +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) + +/* global quota data block, local quota data block, global quota inode, + * global quota info */ +#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) + +static inline int ocfs2_quota_trans_credits(struct super_block *sb) +{ + int credits = 0; + + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + return credits; +} + +/* Number of credits needed for removing quota structure from file */ +int ocfs2_calc_qdel_credits(struct super_block *sb, int type); +/* Number of credits needed for initialization of new quota structure */ +int ocfs2_calc_qinit_credits(struct super_block *sb, int type); + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle, * prev. group desc. if we relink. */ #define OCFS2_SUBALLOC_ALLOC (3) -#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ - + OCFS2_INODE_UPDATE_CREDITS) +static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) +{ + return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* dinode + group descriptor update. We don't relink on free yet. */ #define OCFS2_SUBALLOC_FREE (2) @@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + OCFS2_TRUNCATE_LOG_UPDATE) -#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) +static inline int ocfs2_remove_extent_credits(struct super_block *sb) +{ + return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + * bitmap block for the new bit) */ #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) /* parent fe, parent block, new file entry, inode alloc fe, inode alloc - * group descriptor + mkdir/symlink blocks */ -#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ - + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) + * group descriptor + mkdir/symlink blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb) +{ + return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* local alloc metadata change + main bitmap updates */ #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ @@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle, * for the dinode, one for the new block. */ #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) -/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ -#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) +/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota + * update on dir */ +static inline int ocfs2_link_credits(struct super_block *sb) +{ + return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + + ocfs2_quota_trans_credits(sb); +} /* inode + dir inode (if we unlink a dir), + dir entry block + orphan * dir inode link */ -#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ - + OCFS2_LINK_CREDITS) +static inline int ocfs2_unlink_credits(struct super_block *sb) +{ + /* The quota update from ocfs2_link_credits is unused here... */ + return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); +} /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + * inode alloc group descriptor */ @@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink */ -#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ - + OCFS2_UNLINK_CREDITS) +static inline int ocfs2_rename_credits(struct super_block *sb) +{ + return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); +} /* global bitmap dinode, group desc., relinked group, * suballocator dinode, group desc., relinked group, @@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * credit for the dinode there. */ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) { - int blocks = OCFS2_MKNOD_CREDITS; + int blocks = ocfs2_mknod_credits(sb); /* links can be longer than one block so we may update many * within our single allocated extent. */ blocks += ocfs2_clusters_to_blocks(sb, 1); - return blocks; + return blocks + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, @@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, /* update to the truncate log. */ credits += OCFS2_TRUNCATE_LOG_UPDATE; + credits += ocfs2_quota_trans_credits(sb); + return credits; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 687b28713c32..ec70cdbe77fc 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -36,6 +36,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" @@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, - &alloc_bh, OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } memcpy(alloc_copy, alloc, bh->b_size); - status = ocfs2_journal_access(handle, local_alloc_inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, - &alloc_bh, OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) alloc_bh->b_data; ocfs2_clear_local_alloc(alloc); + ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); status = ocfs2_write_block(osb, alloc_bh, inode); if (status < 0) mlog_errno(status); @@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, * delete bits from it! */ *num_bits = bits_wanted; - status = ocfs2_journal_access(handle, local_alloc_inode, - osb->local_alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); - status = ocfs2_journal_access(handle, local_alloc_inode, - osb->local_alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2545e7402efe..084aba86c3b2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -40,6 +40,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -61,17 +62,18 @@ #include "sysfile.h" #include "uptodate.h" #include "xattr.h" +#include "acl.h" #include "buffer_head_io.h" static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, - struct dentry *dentry, int mode, + struct inode *inode, + struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, handle_t *handle, - struct inode **ret_inode, struct ocfs2_alloc_context *inode_ac); static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, @@ -186,6 +188,35 @@ bail: return ret; } +static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) { + mlog(ML_ERROR, "new_inode failed!\n"); + return NULL; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + vfs_dq_init(inode); + return inode; +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, @@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir, struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota_inode = 0; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* Reserve a cluster if creating an extent based directory. */ - if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { - status = ocfs2_reserve_clusters(osb, 1, &data_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); goto leave; } } - handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); + /* calculate meta data/clusters for setting security and acl xattr */ + status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, + &si, &want_clusters, + &xattr_credits, &xattr_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* Reserve a cluster if creating an extent based directory. */ + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + + xattr_credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto leave; + } + did_quota_inode = 1; + /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, + status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, &new_fe_bh, parent_fe_bh, handle, - &inode, inode_ac); + inode_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = ocfs2_journal_access(handle, dir, parent_fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, de_bh); @@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -331,9 +425,13 @@ leave: brelse(new_fe_bh); brelse(de_bh); brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); - if ((status < 0) && inode) + if ((status < 0) && inode) { + clear_nlink(inode); iput(inode); + } if (inode_ac) ocfs2_free_alloc_context(inode_ac); @@ -341,6 +439,9 @@ leave: if (data_ac) ocfs2_free_alloc_context(data_ac); + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + mlog_exit(status); return status; @@ -348,12 +449,12 @@ leave: static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, - struct dentry *dentry, int mode, + struct inode *inode, + struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, handle_t *handle, - struct inode **ret_inode, struct ocfs2_alloc_context *inode_ac) { int status = 0; @@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct ocfs2_extent_list *fel; u64 fe_blkno = 0; u16 suballoc_bit; - struct inode *inode = NULL; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, - (unsigned long)dev, dentry->d_name.len, + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, + inode->i_mode, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); *new_fe_bh = NULL; - *ret_inode = NULL; status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, &fe_blkno); @@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, goto leave; } - inode = new_inode(dir->i_sb); - if (!inode) { - status = -ENOMEM; - mlog(ML_ERROR, "new_inode failed!\n"); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); OCFS2_I(inode)->ip_blkno = fe_blkno; - if (S_ISDIR(mode)) - inode->i_nlink = 2; - else - inode->i_nlink = 1; - inode->i_mode = mode; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); @@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); - status = ocfs2_journal_access(handle, inode, *new_fe_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto leave; @@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_blkno = cpu_to_le64(fe_blkno); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); - fe->i_uid = cpu_to_le32(current_fsuid()); - if (dir->i_mode & S_ISGID) { - fe->i_gid = cpu_to_le32(dir->i_gid); - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - fe->i_gid = cpu_to_le32(current_fsgid()); - fe->i_mode = cpu_to_le16(mode); - if (S_ISCHR(mode) || S_ISBLK(mode)) + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); fe->i_last_eb_blk = 0; @@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, /* * If supported, directories start with inline data. */ - if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { + if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { u16 feat = le16_to_cpu(fe->i_dyn_features); fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); @@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, goto leave; } - if (ocfs2_populate_inode(inode, fe, 1) < 0) { - mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " - "i_blkno=%llu, i_ino=%lu\n", - (unsigned long long)(*new_fe_bh)->b_blocknr, - (unsigned long long)le64_to_cpu(fe->i_blkno), - inode->i_ino); - BUG(); - } - + ocfs2_populate_inode(inode, fe, 1); ocfs2_inode_set_new(osb, inode); if (!ocfs2_mount_local(osb)) { status = ocfs2_create_new_inode_locks(inode); @@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ - *ret_inode = inode; leave: if (status < 0) { if (*new_fe_bh) { brelse(*new_fe_bh); *new_fe_bh = NULL; } - if (inode) { - clear_nlink(inode); - iput(inode); - } } mlog_exit(status); @@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); handle = NULL; @@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - err = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + err = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { mlog_errno(err); goto out_commit; @@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir, } } - handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir, } } - handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } } - status = ocfs2_journal_access(handle, new_inode, newfe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - status = ocfs2_journal_access(handle, old_inode, old_inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; @@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir, (int)old_dir_nlink, old_dir->i_nlink); } else { struct ocfs2_dinode *fe; - status = ocfs2_journal_access(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, old_dir, + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; fe->i_links_count = cpu_to_le16(old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); @@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir, handle_t *handle = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota = 0, did_quota_inode = 0; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* don't reserve bitmap space for fast symlinks. */ - if (l > ocfs2_fast_symlink_chars(sb)) { - status = ocfs2_reserve_clusters(osb, 1, &data_ac); + inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto bail; + } + } + + /* calculate meta data/clusters for setting security xattr */ + if (si.enable) { + status = ocfs2_calc_security_init(dir, &si, &want_clusters, + &xattr_credits, &xattr_ac); if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } } - handle = ocfs2_start_trans(osb, credits); + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, credits + xattr_credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - status = ocfs2_mknod_locked(osb, dir, dentry, - S_IFLNK | S_IRWXUGO, 0, - &new_fe_bh, parent_fe_bh, handle, - &inode, inode_ac); + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto bail; + } + did_quota_inode = 1; + + status = ocfs2_mknod_locked(osb, dir, inode, dentry, + 0, &new_fe_bh, parent_fe_bh, handle, + inode_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; + if (vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + status = -EDQUOT; + goto bail; + } + did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, handle, data_ac, NULL, @@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir, } } + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, de_bh); @@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir, dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); bail: + if (status < 0 && did_quota) + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -1640,12 +1772,18 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); brelse(de_bh); + kfree(si.name); + kfree(si.value); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) ocfs2_free_alloc_context(data_ac); - if ((status < 0) && inode) + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + if ((status < 0) && inode) { + clear_nlink(inode); iput(inode); + } mlog_exit(status); @@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(orphan_dir_inode, - OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh); + status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3fed9e3d8992..ad5c24a29edd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -161,6 +161,7 @@ enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, + VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; @@ -195,6 +196,9 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -205,6 +209,7 @@ enum ocfs2_mount_options struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; +struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; @@ -286,10 +291,11 @@ struct ocfs2_super char *local_alloc_debug_buf; #endif - /* Next two fields are for local node slot recovery during + /* Next three fields are for local node slot recovery during * mount. */ int dirty; struct ocfs2_dinode *local_alloc_copy; + struct ocfs2_quota_recovery *quota_rec; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ @@ -333,6 +339,10 @@ struct ocfs2_super #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) +/* Useful typedef for passing around journal access functions */ +typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); + static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) @@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ - typeof(__di) ____di = (__di); \ - ocfs2_error((__sb), \ - "Dinode # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \ - (____di)->i_signature); \ -} while (0) - #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ - typeof(__eb) ____eb = (__eb); \ - ocfs2_error((__sb), \ - "Extent Block # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \ - (____eb)->h_signature); \ -} while (0) - #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ - typeof(__gd) ____gd = (__gd); \ - ocfs2_error((__sb), \ - "Group Descriptor # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \ - (____gd)->bg_signature); \ -} while (0) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) +#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ + (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { @@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit +#define ocfs2_find_next_bit ext2_find_next_bit #endif /* OCFS2_H */ diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5e0c0d0aef7d..c7ae45aaa36c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -65,6 +65,7 @@ #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" +#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -93,8 +94,11 @@ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ - | OCFS2_FEATURE_INCOMPAT_XATTR) -#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN + | OCFS2_FEATURE_INCOMPAT_XATTR \ + | OCFS2_FEATURE_INCOMPAT_META_ECC) +#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ + | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The @@ -147,6 +151,9 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 +/* Metadata checksum and error correction */ +#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -163,6 +170,12 @@ */ #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 +/* + * Maintain quota information for this filesystem + */ +#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 +#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -192,6 +205,7 @@ #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ +#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -329,13 +343,17 @@ enum { #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, -#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE + USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, JOURNAL_SYSTEM_INODE, LOCAL_ALLOC_SYSTEM_INODE, TRUNCATE_LOG_SYSTEM_INODE, + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE, NUM_SYSTEM_INODES }; @@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, /* Slot-specific system inodes (one copy per slot) */ [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, @@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, - [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, + [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, }; /* Parameter passed from mount.ocfs2 to module */ @@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) /* + * Block checking structure. This is used in metadata to validate the + * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all + * zeros. + */ +struct ocfs2_block_check { +/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ + __le16 bc_ecc; /* Single-error-correction parity vector. + This is a simple Hamming code dependant + on the blocksize. OCFS2's maximum + blocksize, 4K, requires 16 parity bits, + so we fit in __le16. */ + __le16 bc_reserved1; +/*08*/ +}; + +/* * On disk extent record for OCFS2 * It describes a range of clusters on disk. * @@ -496,7 +534,7 @@ struct ocfs2_truncate_log { struct ocfs2_extent_block { /*00*/ __u8 h_signature[8]; /* Signature for verification */ - __le64 h_reserved1; + struct ocfs2_block_check h_check; /* Error checking */ /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this extent_header belongs to */ __le16 h_suballoc_bit; /* Bit offset in suballocator @@ -666,7 +704,8 @@ struct ocfs2_dinode { was set in i_flags */ __le16 i_dyn_features; __le64 i_xattr_loc; -/*80*/ __le64 i_reserved2[7]; +/*80*/ struct ocfs2_block_check i_check; /* Error checking */ +/*88*/ __le64 i_reserved2[6]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -715,6 +754,34 @@ struct ocfs2_dir_entry { } __attribute__ ((packed)); /* + * Per-block record for the unindexed directory btree. This is carefully + * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are + * mirrored. That way, the directory manipulation code needs a minimal amount + * of update. + * + * NOTE: Keep this structure aligned to a multiple of 4 bytes. + */ +struct ocfs2_dir_block_trailer { +/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ + + __le16 db_compat_rec_len; /* Backwards compatible with + * ocfs2_dir_entry. */ + __u8 db_compat_name_len; /* Always zero. Was name_len */ + __u8 db_reserved0; + __le16 db_reserved1; + __le16 db_free_rec_len; /* Size of largest empty hole + * in this block. (unused) */ +/*10*/ __u8 db_signature[8]; /* Signature for verification */ + __le64 db_reserved2; + __le64 db_free_next; /* Next block in list (unused) */ +/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ + __le64 db_parent_dinode; /* dinode which owns me, in + blocks */ +/*30*/ struct ocfs2_block_check db_check; /* Error checking */ +/*40*/ +}; + +/* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc @@ -733,7 +800,8 @@ struct ocfs2_group_desc /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in blocks */ __le64 bg_blkno; /* Offset on disk, in blocks */ -/*30*/ __le64 bg_reserved2[2]; +/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ + __le64 bg_reserved2; /*40*/ __u8 bg_bitmap[0]; }; @@ -776,7 +844,12 @@ struct ocfs2_xattr_header { in this extent record, only valid in the first bucket. */ - __le64 xh_csum; + struct ocfs2_block_check xh_check; /* Error checking + (Note, this is only + used for xattr + buckets. A block uses + xb_check and sets + this field to zero.) */ struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ }; @@ -827,7 +900,7 @@ struct ocfs2_xattr_block { block group */ __le32 xb_fs_generation; /* Must match super block */ /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ - __le64 xb_csum; + struct ocfs2_block_check xb_check; /* Error checking */ /*20*/ __le16 xb_flags; /* Indicates whether this block contains real xattr or a xattr tree. */ __le16 xb_reserved0; @@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) return xe->xe_type & OCFS2_XATTR_TYPE_MASK; } +/* + * On disk structures for global quota file + */ + +/* Magic numbers and known versions for global quota files */ +#define OCFS2_GLOBAL_QMAGICS {\ + 0x0cf52470, /* USRQUOTA */ \ + 0x0cf52471 /* GRPQUOTA */ \ +} + +#define OCFS2_GLOBAL_QVERSIONS {\ + 0, \ + 0, \ +} + + +/* Each block of each quota file has a certain fixed number of bytes reserved + * for OCFS2 internal use at its end. OCFS2 can use it for things like + * checksums, etc. */ +#define OCFS2_QBLK_RESERVED_SPACE 8 + +/* Generic header of all quota files */ +struct ocfs2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* Quota format version */ +}; + +#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of global quota file (immediately follows the generic + * header) */ +struct ocfs2_global_disk_dqinfo { +/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ + __le32 dqi_igrace; /* Grace time for inode softlimit excess */ + __le32 dqi_syncms; /* Time after which we sync local changes to + * global quota file */ + __le32 dqi_blocks; /* Number of blocks in quota file */ +/*10*/ __le32 dqi_free_blk; /* First free block in quota file */ + __le32 dqi_free_entry; /* First block with free dquot entry in quota + * file */ +}; + +/* Structure with global user / group information. We reserve some space + * for future use. */ +struct ocfs2_global_disk_dqblk { +/*00*/ __le32 dqb_id; /* ID the structure belongs to */ + __le32 dqb_use_count; /* Number of nodes having reference to this structure */ + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ +/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ +/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space */ +/*30*/ __le64 dqb_curspace; /* current space occupied */ + __le64 dqb_btime; /* time limit for excessive disk use */ +/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ + __le64 dqb_pad1; +/*50*/ __le64 dqb_pad2; +}; + +/* + * On-disk structures for local quota file + */ + +/* Magic numbers and known versions for local quota files */ +#define OCFS2_LOCAL_QMAGICS {\ + 0x0cf524c0, /* USRQUOTA */ \ + 0x0cf524c1 /* GRPQUOTA */ \ +} + +#define OCFS2_LOCAL_QVERSIONS {\ + 0, \ + 0, \ +} + +/* Quota flags in dqinfo header */ +#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ + * quota has been cleanly turned off) */ + +#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of local quota file (immediately follows the generic + * header) */ +struct ocfs2_local_disk_dqinfo { + __le32 dqi_flags; /* Flags for quota file */ + __le32 dqi_chunks; /* Number of chunks of quota structures + * with a bitmap */ + __le32 dqi_blocks; /* Number of blocks allocated for quota file */ +}; + +/* Header of one chunk of a quota file */ +struct ocfs2_local_disk_chunk { + __le32 dqc_free; /* Number of free entries in the bitmap */ + u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + * chunk of quota file */ +}; + +/* One entry in local quota file */ +struct ocfs2_local_disk_dqblk { +/*00*/ __le64 dqb_id; /* id this quota applies to */ + __le64 dqb_spacemod; /* Change in the amount of used space */ +/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ +}; + + +/* + * The quota trailer lives at the end of each quota block. + */ + +struct ocfs2_disk_dqtrailer { +/*00*/ struct ocfs2_block_check dq_check; /* Error checking */ +/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ +}; + +static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, + void *buf) +{ + char *ptr = buf; + ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; + + return (struct ocfs2_disk_dqtrailer *)ptr; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h deleted file mode 100644 index b91c78f8f558..000000000000 --- a/fs/ocfs2/ocfs2_jbd_compat.h +++ /dev/null @@ -1,82 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ocfs2_jbd_compat.h - * - * Compatibility defines for JBD. - * - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#ifndef OCFS2_JBD_COMPAT_H -#define OCFS2_JBD_COMPAT_H - -#ifndef CONFIG_OCFS2_COMPAT_JBD -# error Should not have been included -#endif - -struct jbd2_inode { - unsigned int dummy; -}; - -#define JBD2_BARRIER JFS_BARRIER -#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE - -#define jbd2_journal_ack_err journal_ack_err -#define jbd2_journal_clear_err journal_clear_err -#define jbd2_journal_destroy journal_destroy -#define jbd2_journal_dirty_metadata journal_dirty_metadata -#define jbd2_journal_errno journal_errno -#define jbd2_journal_extend journal_extend -#define jbd2_journal_flush journal_flush -#define jbd2_journal_force_commit journal_force_commit -#define jbd2_journal_get_write_access journal_get_write_access -#define jbd2_journal_get_undo_access journal_get_undo_access -#define jbd2_journal_init_inode journal_init_inode -#define jbd2_journal_invalidatepage journal_invalidatepage -#define jbd2_journal_load journal_load -#define jbd2_journal_lock_updates journal_lock_updates -#define jbd2_journal_restart journal_restart -#define jbd2_journal_start journal_start -#define jbd2_journal_start_commit journal_start_commit -#define jbd2_journal_stop journal_stop -#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers -#define jbd2_journal_unlock_updates journal_unlock_updates -#define jbd2_journal_wipe journal_wipe -#define jbd2_log_wait_commit log_wait_commit - -static inline int jbd2_journal_file_inode(handle_t *handle, - struct jbd2_inode *inode) -{ - return 0; -} - -static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, - loff_t new_size) -{ - return 0; -} - -static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, - struct inode *inode) -{ - return; -} - -static inline void jbd2_journal_release_jbd_inode(journal_t *journal, - struct jbd2_inode *jinode) -{ - return; -} - - -#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 82c200f7a8f1..eb6f50c9ceca 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -46,6 +46,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_QINFO, OCFS2_NUM_LOCK_TYPES }; @@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_FLOCK: c = 'F'; break; + case OCFS2_LOCK_TYPE_QINFO: + c = 'Q'; + break; default: c = '\0'; } @@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_QINFO] = "Quota", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h new file mode 100644 index 000000000000..7365e2e08706 --- /dev/null +++ b/fs/ocfs2/quota.h @@ -0,0 +1,119 @@ +/* + * quota.h for OCFS2 + * + * On disk quota structures for local and global quota file, in-memory + * structures. + * + */ + +#ifndef _OCFS2_QUOTA_H +#define _OCFS2_QUOTA_H + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/quota.h> +#include <linux/list.h> +#include <linux/dqblk_qtree.h> + +#include "ocfs2.h" + +/* Common stuff */ +/* id number of quota format */ +#define QFMT_OCFS2 3 + +/* + * In-memory structures + */ +struct ocfs2_dquot { + struct dquot dq_dquot; /* Generic VFS dquot */ + loff_t dq_local_off; /* Offset in the local quota file */ + struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ + unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ + s64 dq_origspace; /* Last globally synced space usage */ + s64 dq_originodes; /* Last globally synced inode usage */ +}; + +/* Description of one chunk to recover in memory */ +struct ocfs2_recovery_chunk { + struct list_head rc_list; /* List of chunks */ + int rc_chunk; /* Chunk number */ + unsigned long *rc_bitmap; /* Bitmap of entries to recover */ +}; + +struct ocfs2_quota_recovery { + struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ +}; + +/* In-memory structure with quota header information */ +struct ocfs2_mem_dqinfo { + unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_chunks; /* Number of chunks in local quota file */ + unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ + unsigned int dqi_syncms; /* How often should we sync with other nodes */ + unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ + struct list_head dqi_chunk; /* List of chunks */ + struct inode *dqi_gqinode; /* Global quota file inode */ + struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ + struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ + int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ + struct buffer_head *dqi_ibh; /* Buffer with information header */ + struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ + struct delayed_work dqi_sync_work; /* Work for syncing dquots */ + struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery + * information, in case we + * enable quotas on file + * needing it */ +}; + +static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) +{ + return container_of(dquot, struct ocfs2_dquot, dq_dquot); +} + +struct ocfs2_quota_chunk { + struct list_head qc_chunk; /* List of quotafile chunks */ + int qc_num; /* Number of quota chunk */ + struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ +}; + +extern struct kmem_cache *ocfs2_dquot_cachep; +extern struct kmem_cache *ocfs2_qf_chunk_cachep; + +extern struct qtree_fmt_operations ocfs2_global_ops; + +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, int slot_num); +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num); +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +int ocfs2_global_read_info(struct super_block *sb, int type); +int ocfs2_global_write_info(struct super_block *sb, int type); +int ocfs2_global_read_dquot(struct dquot *dquot); +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); +static inline int ocfs2_sync_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 0); +} +static inline int ocfs2_global_release_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 1); +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh); + +extern struct dquot_operations ocfs2_quota_operations; +extern struct quota_format_type ocfs2_quota_format; + +int ocfs2_quota_setup(void); +void ocfs2_quota_shutdown(void); + +#endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c new file mode 100644 index 000000000000..6aff8f2d3e49 --- /dev/null +++ b/fs/ocfs2/quota_global.c @@ -0,0 +1,1025 @@ +/* + * Implementation of operations over global quota file + */ +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/quotaops.h> +#include <linux/dqblk_qtree.h> +#include <linux/jiffies.h> +#include <linux/writeback.h> +#include <linux/workqueue.h> + +#define MLOG_MASK_PREFIX ML_QUOTA +#include <cluster/masklog.h> + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "inode.h" +#include "journal.h" +#include "file.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "uptodate.h" +#include "quota.h" + +static struct workqueue_struct *ocfs2_quota_wq = NULL; + +static void qsync_work_fn(struct work_struct *work); + +static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + /* Update from disk only entries not set by the admin */ + if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { + m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) + m->dqb_btime = le64_to_cpu(d->dqb_btime); + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) + m->dqb_itime = le64_to_cpu(d->dqb_itime); + OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); +} + +static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_itime = cpu_to_le64(m->dqb_itime); +} + +static int ocfs2_global_is_id(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(&oinfo->dqi_gi, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +struct qtree_fmt_operations ocfs2_global_ops = { + .mem2disk_dqblk = ocfs2_global_mem2diskdqb, + .disk2mem_dqblk = ocfs2_global_disk2memdqb, + .is_id = ocfs2_global_is_id, +}; + +static int ocfs2_validate_quota_block(struct super_block *sb, + struct buffer_head *bh) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); + + mlog(0, "Validating quota block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); +} + +int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static int ocfs2_get_quota_block(struct inode *inode, int block, + struct buffer_head **bh) +{ + u64 pblock, pcount; + int err; + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (err) { + mlog_errno(err); + return err; + } + *bh = sb_getblk(inode->i_sb, pblock); + if (!*bh) { + err = -EIO; + mlog_errno(err); + } + return err;; +} + +/* Read data from global quotafile - avoid pagecache and such because we cannot + * afford acquiring the locks... We use quota cluster lock to serialize + * operations. Caller is responsible for acquiring it. */ +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + loff_t i_size = i_size_read(gqinode); + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + struct buffer_head *bh; + size_t toread, tocopy; + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + bh = NULL; + err = ocfs2_read_quota_block(gqinode, blk, &bh); + if (err) { + mlog_errno(err); + return err; + } + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0, new = 0, ja_type; + struct buffer_head *bh = NULL; + handle_t *handle = journal_current_handle(); + + if (!handle) { + mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " + "because transaction was not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { + WARN_ON(1); + len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; + } + + mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); + if (gqinode->i_size < off + len) { + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, off + len, off); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; + err = ocfs2_simple_size_update(gqinode, + oinfo->dqi_gqi_bh, + off + len); + if (err < 0) + goto out; + new = 1; + } + /* Not rewriting whole block? */ + if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && + !new) { + err = ocfs2_read_quota_block(gqinode, blk, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_WRITE; + } else { + err = ocfs2_get_quota_block(gqinode, blk, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_CREATE; + } + if (err) { + mlog_errno(err); + return err; + } + lock_buffer(bh); + if (new) + memset(bh->b_data, 0, sb->s_blocksize); + memcpy(bh->b_data + offset, data, len); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ocfs2_set_buffer_uptodate(gqinode, bh); + err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); + if (err < 0) { + brelse(bh); + goto out; + } + err = ocfs2_journal_dirty(handle, bh); + brelse(bh); + if (err < 0) + goto out; +out: + if (err) { + mutex_unlock(&gqinode->i_mutex); + mlog_errno(err); + return err; + } + gqinode->i_version++; + ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); + mutex_unlock(&gqinode->i_mutex); + return len; +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + int status; + struct buffer_head *bh = NULL; + + status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); + if (status < 0) + return status; + spin_lock(&dq_data_lock); + if (!oinfo->dqi_gqi_count++) + oinfo->dqi_gqi_bh = bh; + else + WARN_ON(bh != oinfo->dqi_gqi_bh); + spin_unlock(&dq_data_lock); + return 0; +} + +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); + brelse(oinfo->dqi_gqi_bh); + spin_lock(&dq_data_lock); + if (!--oinfo->dqi_gqi_count) + oinfo->dqi_gqi_bh = NULL; + spin_unlock(&dq_data_lock); +} + +/* Read information header from global quota file */ +int ocfs2_global_read_info(struct super_block *sb, int type) +{ + struct inode *gqinode = NULL; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct ocfs2_global_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + int status; + + mlog_entry_void(); + + /* Read global header */ + gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + status = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + ocfs2_unlock_global_qf(oinfo, 0); + if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot read global quota info (%d).\n", + status); + if (status >= 0) + status = -EIO; + mlog_errno(status); + goto out_err; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); + oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; + oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - + OCFS2_QBLK_RESERVED_SPACE; + oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); + INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); + queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, + oinfo->dqi_syncjiff); + +out_err: + mlog_exit(status); + return status; +} + +/* Write information to global quota file. Expects exlusive lock on quota + * file inode and quota info */ +static int __ocfs2_global_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_global_disk_dqinfo dinfo; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + spin_unlock(&dq_data_lock); + dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); + dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot write global quota info structure\n"); + if (size >= 0) + size = -EIO; + return size; + } + return 0; +} + +int ocfs2_global_write_info(struct super_block *sb, int type) +{ + int err; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + return err; + err = __ocfs2_global_write_info(sb, type); + ocfs2_qinfo_unlock(info, 1); + return err; +} + +/* Read in information from global quota file and acquire a reference to it. + * dquot_acquire() has already started the transaction and locked quota file */ +int ocfs2_global_read_dquot(struct dquot *dquot) +{ + int err, err2, ex = 0; + struct ocfs2_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 0); + if (err < 0) + goto out; + err = qtree_read_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + /* Upgrade to exclusive lock for allocation */ + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + goto out_qlock; + ex = 1; + } + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { + err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); + if (!err) + err = err2; + } +out_qlock: + if (ex) + ocfs2_qinfo_unlock(info, 1); + ocfs2_qinfo_unlock(info, 0); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* Sync local information about quota modifications with global quota file. + * Caller must have started the transaction and obtained exclusive lock for + * global quota file inode */ +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) +{ + int err, err2; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_global_disk_dqblk dqblk; + s64 spacechange, inodechange; + time_t olditime, oldbtime; + + err = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct ocfs2_global_disk_dqblk), + dquot->dq_off); + if (err != sizeof(struct ocfs2_global_disk_dqblk)) { + if (err >= 0) { + mlog(ML_ERROR, "Short read from global quota file " + "(%u read)\n", err); + err = -EIO; + } + goto out; + } + + /* Update space and inode usage. Get also other information from + * global quota file so that we don't overwrite any changes there. + * We are */ + spin_lock(&dq_data_lock); + spacechange = dquot->dq_dqb.dqb_curspace - + OCFS2_DQUOT(dquot)->dq_origspace; + inodechange = dquot->dq_dqb.dqb_curinodes - + OCFS2_DQUOT(dquot)->dq_originodes; + olditime = dquot->dq_dqb.dqb_itime; + oldbtime = dquot->dq_dqb.dqb_btime; + ocfs2_global_disk2memdqb(dquot, &dqblk); + mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", + dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, (long long)inodechange); + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curspace += spacechange; + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curinodes += inodechange; + /* Set properly space grace time... */ + if (dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && + oldbtime > 0) { + if (dquot->dq_dqb.dqb_btime > 0) + dquot->dq_dqb.dqb_btime = + min(dquot->dq_dqb.dqb_btime, oldbtime); + else + dquot->dq_dqb.dqb_btime = oldbtime; + } + } else { + dquot->dq_dqb.dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + /* Set properly inode grace time... */ + if (dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && + olditime > 0) { + if (dquot->dq_dqb.dqb_itime > 0) + dquot->dq_dqb.dqb_itime = + min(dquot->dq_dqb.dqb_itime, olditime); + else + dquot->dq_dqb.dqb_itime = olditime; + } + } else { + dquot->dq_dqb.dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + /* All information is properly updated, clear the flags */ + __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + spin_unlock(&dq_data_lock); + err = ocfs2_qinfo_lock(info, freeing); + if (err < 0) { + mlog(ML_ERROR, "Failed to lock quota info, loosing quota write" + " (type=%d, id=%u)\n", dquot->dq_type, + (unsigned)dquot->dq_id); + goto out; + } + if (freeing) + OCFS2_DQUOT(dquot)->dq_use_count--; + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { + err = qtree_release_dquot(&info->dqi_gi, dquot); + if (info_dirty(sb_dqinfo(sb, type))) { + err2 = __ocfs2_global_write_info(sb, type); + if (!err) + err = err2; + } + } +out_qlock: + ocfs2_qinfo_unlock(info, freeing); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* + * Functions for periodic syncing of dquots with global file + */ +static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) +{ + handle_t *handle; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + int status = 0; + + mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, + dquot->dq_type, type, sb->s_id); + if (type != dquot->dq_type) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + if (status < 0) + mlog_errno(status); + /* We have to write local structure as well... */ + dquot_mark_dquot_dirty(dquot); + status = dquot_commit(dquot); + if (status < 0) + mlog_errno(status); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +static void qsync_work_fn(struct work_struct *work) +{ + struct ocfs2_mem_dqinfo *oinfo = container_of(work, + struct ocfs2_mem_dqinfo, + dqi_sync_work.work); + struct super_block *sb = oinfo->dqi_gqinode->i_sb; + + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, + oinfo->dqi_syncjiff); +} + +/* + * Wrappers for generic quota functions + */ + +static int ocfs2_write_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + + handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + status = dquot_commit(dquot); + ocfs2_commit_trans(osb, handle); +out: + mlog_exit(status); + return status; +} + +int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo; + int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) + return 0; + + oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* We modify tree, leaf block, global info, local chunk header, + * global and local inode */ + return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + + 2 * OCFS2_INODE_UPDATE_CREDITS; +} + +static int ocfs2_release_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_release(dquot); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +int ocfs2_calc_qinit_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo; + int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; + struct ocfs2_dinode *lfe, *gfe; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) + return 0; + + oinfo = sb_dqinfo(sb, type)->dqi_priv; + gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; + lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; + /* We can extend local file + global file. In local file we + * can modify info, chunk header block and dquot block. In + * global file we can modify info, tree and leaf block */ + return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + + ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + + 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; +} + +static int ocfs2_acquire_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + /* We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure */ + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_acquire(dquot); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_mark_dquot_dirty(struct dquot *dquot) +{ + unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_INODES_B)) | + (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | + (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | + (1 << (DQ_LASTSET_B + QIF_ITIME_B)); + int sync = 0; + int status; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(sb); + + mlog_entry("id=%u, type=%d", dquot->dq_id, type); + dquot_mark_dquot_dirty(dquot); + + /* In case user set some limits, sync dquot immediately to global + * quota file so that information propagates quicker */ + spin_lock(&dq_data_lock); + if (dquot->dq_flags & mask) + sync = 1; + spin_unlock(&dq_data_lock); + if (!sync) { + status = ocfs2_write_dquot(dquot); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = ocfs2_sync_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Now write updated local dquot structure */ + status = dquot_commit(dquot); +out_trans: + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +/* This should happen only after set_dqinfo(). */ +static int ocfs2_write_info(struct super_block *sb, int type) +{ + handle_t *handle; + int status = 0; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + mlog_entry_void(); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_commit_info(sb, type); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +/* This is difficult. We have to lock quota inode and start transaction + * in this function but we don't want to take the penalty of exlusive + * quota file lock when we are just going to use cached structures. So + * we just take read lock check whether we have dquot cached and if so, + * we don't have to take the write lock... */ +static int ocfs2_dquot_initialize(struct inode *inode, int type) +{ + handle_t *handle = NULL; + int status = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + int exclusive = 0; + int cnt; + qid_t id; + + mlog_entry_void(); + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) + goto out; + /* This is just a performance optimization not a reliable test. + * Since we hold an inode lock, noone can actually release + * the structure until we are finished with initialization. */ + if (inode->i_dquot[cnt] != NODQUOT) { + ocfs2_unlock_global_qf(oinfo, 0); + continue; + } + /* When we have inode lock, we know that no dquot_release() can + * run and thus we can safely check whether we need to + * read+modify global file to get quota information or whether + * our node already has it. */ + if (cnt == USRQUOTA) + id = inode->i_uid; + else if (cnt == GRPQUOTA) + id = inode->i_gid; + else + BUG(); + /* Obtain exclusion from quota off... */ + down_write(&sb_dqopt(sb)->dqptr_sem); + exclusive = !dquot_is_cached(sb, id, cnt); + up_write(&sb_dqopt(sb)->dqptr_sem); + if (exclusive) { + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + exclusive = 0; + mlog_errno(status); + goto out_ilock; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + ocfs2_calc_qinit_credits(sb, cnt)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + } + dquot_initialize(inode, cnt); + if (exclusive) { + ocfs2_commit_trans(OCFS2_SB(sb), handle); + ocfs2_unlock_global_qf(oinfo, 1); + } + ocfs2_unlock_global_qf(oinfo, 0); + } + mlog_exit(0); + return 0; +out_ilock: + if (exclusive) + ocfs2_unlock_global_qf(oinfo, 1); + ocfs2_unlock_global_qf(oinfo, 0); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_dquot_drop_slow(struct inode *inode) +{ + int status = 0; + int cnt; + int got_lock[MAXQUOTAS] = {0, 0}; + handle_t *handle; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + got_lock[cnt] = 1; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + ocfs2_calc_qinit_credits(sb, USRQUOTA) + + ocfs2_calc_qinit_credits(sb, GRPQUOTA)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + dquot_drop(inode); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (got_lock[cnt]) { + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 1); + } + return status; +} + +/* See the comment before ocfs2_dquot_initialize. */ +static int ocfs2_dquot_drop(struct inode *inode) +{ + int status = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + int exclusive = 0; + int cnt; + int got_lock[MAXQUOTAS] = {0, 0}; + + mlog_entry_void(); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) + goto out; + got_lock[cnt] = 1; + } + /* Lock against anyone releasing references so that when when we check + * we know we are not going to be last ones to release dquot */ + down_write(&sb_dqopt(sb)->dqptr_sem); + /* Urgh, this is a terrible hack :( */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT && + atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) { + exclusive = 1; + break; + } + } + if (!exclusive) + dquot_drop_locked(inode); + up_write(&sb_dqopt(sb)->dqptr_sem); +out: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (got_lock[cnt]) { + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 0); + } + /* In case we bailed out because we had to do expensive locking + * do it now... */ + if (exclusive) + status = ocfs2_dquot_drop_slow(inode); + mlog_exit(status); + return status; +} + +static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) +{ + struct ocfs2_dquot *dquot = + kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); + + if (!dquot) + return NULL; + return &dquot->dq_dquot; +} + +static void ocfs2_destroy_dquot(struct dquot *dquot) +{ + kmem_cache_free(ocfs2_dquot_cachep, dquot); +} + +struct dquot_operations ocfs2_quota_operations = { + .initialize = ocfs2_dquot_initialize, + .drop = ocfs2_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = ocfs2_write_dquot, + .acquire_dquot = ocfs2_acquire_dquot, + .release_dquot = ocfs2_release_dquot, + .mark_dirty = ocfs2_mark_dquot_dirty, + .write_info = ocfs2_write_info, + .alloc_dquot = ocfs2_alloc_dquot, + .destroy_dquot = ocfs2_destroy_dquot, +}; + +int ocfs2_quota_setup(void) +{ + ocfs2_quota_wq = create_workqueue("o2quot"); + if (!ocfs2_quota_wq) + return -ENOMEM; + return 0; +} + +void ocfs2_quota_shutdown(void) +{ + if (ocfs2_quota_wq) { + flush_workqueue(ocfs2_quota_wq); + destroy_workqueue(ocfs2_quota_wq); + ocfs2_quota_wq = NULL; + } +} diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c new file mode 100644 index 000000000000..07deec5e9721 --- /dev/null +++ b/fs/ocfs2/quota_local.c @@ -0,0 +1,1253 @@ +/* + * Implementation of operations over local quota file + */ + +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/quotaops.h> +#include <linux/module.h> + +#define MLOG_MASK_PREFIX ML_QUOTA +#include <cluster/masklog.h> + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "file.h" +#include "buffer_head_io.h" +#include "journal.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "quota.h" + +/* Number of local quota structures per block */ +static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) +{ + return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / + sizeof(struct ocfs2_local_disk_dqblk)); +} + +/* Number of blocks with entries in one chunk */ +static inline unsigned int ol_chunk_blocks(struct super_block *sb) +{ + return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE) << 3) / + ol_quota_entries_per_block(sb); +} + +/* Number of entries in a chunk bitmap */ +static unsigned int ol_chunk_entries(struct super_block *sb) +{ + return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); +} + +/* Offset of the chunk in quota file */ +static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) +{ + /* 1 block for local quota file info, 1 block per chunk for chunk info */ + return 1 + (ol_chunk_blocks(sb) + 1) * c; +} + +static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ol_quota_chunk_block(sb, c) + 1 + off / epb; +} + +static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Offset of the dquot structure in the quota file */ +static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) +{ + return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + + ol_dqblk_block_off(sb, c, off); +} + +/* Compute block number from given offset */ +static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) +{ + return off >> sb->s_blocksize_bits; +} + +static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) +{ + return off & ((1 << sb->s_blocksize_bits) - 1); +} + +/* Compute offset in the chunk of a structure with the given offset */ +static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ((off >> sb->s_blocksize_bits) - + ol_quota_chunk_block(sb, c) - 1) * epb + + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / + sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Write bufferhead into the fs */ +static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, + void (*modify)(struct buffer_head *, void *), void *private) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + int status; + + handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + return status; + } + status = ocfs2_journal_access_dq(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + lock_buffer(bh); + modify(bh, private); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + return status; + } + return 0; +} + +/* Check whether we understand format of quota files */ +static int ocfs2_local_check_quota_file(struct super_block *sb, int type) +{ + unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct buffer_head *bh = NULL; + struct inode *linode = sb_dqopt(sb)->files[type]; + struct inode *ginode = NULL; + struct ocfs2_disk_dqheader *dqhead; + int status, ret = 0; + + /* First check whether we understand local quota file */ + status = ocfs2_read_quota_block(linode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", + type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { + mlog(ML_ERROR, "quota file magic does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_magic), + lmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { + mlog(ML_ERROR, "quota file version does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_version), + lversions[type], type); + goto out_err; + } + brelse(bh); + bh = NULL; + + /* Next check whether we understand global quota file */ + ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!ginode) { + mlog(ML_ERROR, "cannot get global quota file inode " + "(type=%d)\n", type); + goto out_err; + } + /* Since the header is read only, we don't care about locking */ + status = ocfs2_read_quota_block(ginode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read global quota file header " + "(type=%d)\n", type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { + mlog(ML_ERROR, "global quota file magic does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { + mlog(ML_ERROR, "global quota file version does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_version), gversions[type], + type); + goto out_err; + } + + ret = 1; +out_err: + brelse(bh); + iput(ginode); + return ret; +} + +/* Release given list of quota file chunks */ +static void ocfs2_release_local_quota_bitmaps(struct list_head *head) +{ + struct ocfs2_quota_chunk *pos, *next; + + list_for_each_entry_safe(pos, next, head, qc_chunk) { + list_del(&pos->qc_chunk); + brelse(pos->qc_headerbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, pos); + } +} + +/* Load quota bitmaps into memory */ +static int ocfs2_load_local_quota_bitmaps(struct inode *inode, + struct ocfs2_local_disk_dqinfo *ldinfo, + struct list_head *head) +{ + struct ocfs2_quota_chunk *newchunk; + int i, status; + + INIT_LIST_HEAD(head); + for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { + newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!newchunk) { + ocfs2_release_local_quota_bitmaps(head); + return -ENOMEM; + } + newchunk->qc_num = i; + newchunk->qc_headerbh = NULL; + status = ocfs2_read_quota_block(inode, + ol_quota_chunk_block(inode->i_sb, i), + &newchunk->qc_headerbh); + if (status) { + mlog_errno(status); + kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); + ocfs2_release_local_quota_bitmaps(head); + return status; + } + list_add_tail(&newchunk->qc_chunk, head); + } + return 0; +} + +static void olq_update_info(struct buffer_head *bh, void *private) +{ + struct mem_dqinfo *info = private; + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_local_disk_dqinfo *ldinfo; + + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + spin_lock(&dq_data_lock); + ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); + ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); + spin_unlock(&dq_data_lock); +} + +static int ocfs2_add_recovery_chunk(struct super_block *sb, + struct ocfs2_local_disk_chunk *dchunk, + int chunk, + struct list_head *head) +{ + struct ocfs2_recovery_chunk *rc; + + rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); + if (!rc) + return -ENOMEM; + rc->rc_chunk = chunk; + rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!rc->rc_bitmap) { + kfree(rc); + return -ENOMEM; + } + memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, + (ol_chunk_entries(sb) + 7) >> 3); + list_add_tail(&rc->rc_list, head); + return 0; +} + +static void free_recovery_list(struct list_head *head) +{ + struct ocfs2_recovery_chunk *next; + struct ocfs2_recovery_chunk *rchunk; + + list_for_each_entry_safe(rchunk, next, head, rc_list) { + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + } +} + +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + free_recovery_list(&(rec->r_list[type])); + kfree(rec); +} + +/* Load entries in our quota file we have to recover*/ +static int ocfs2_recovery_load_quota(struct inode *lqinode, + struct ocfs2_local_disk_dqinfo *ldinfo, + int type, + struct list_head *head) +{ + struct super_block *sb = lqinode->i_sb; + struct buffer_head *hbh; + struct ocfs2_local_disk_chunk *dchunk; + int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); + int status = 0; + + for (i = 0; i < chunks; i++) { + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, i), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) + status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); + brelse(hbh); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(head); + return status; +} + +static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) +{ + int type; + struct ocfs2_quota_recovery *rec; + + rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); + if (!rec) + return NULL; + for (type = 0; type < MAXQUOTAS; type++) + INIT_LIST_HEAD(&(rec->r_list[type])); + return rec; +} + +/* Load information we need for quota recovery into memory */ +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, + int slot_num) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct inode *lqinode; + struct buffer_head *bh; + int type; + int status = 0; + struct ocfs2_quota_recovery *rec; + + mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + rec = ocfs2_alloc_quota_recovery(); + if (!rec) + return ERR_PTR(-ENOMEM); + /* First init... */ + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + /* At this point, journal of the slot is already replayed so + * we can trust metadata and data of the quota file */ + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + if (status < 0) { + ocfs2_free_quota_recovery(rec); + rec = ERR_PTR(status); + } + return rec; +} + +/* Sync changes in local quota file into global quota file and + * reinitialize local quota file. + * The function expects local quota file to be already locked and + * dqonoff_mutex locked. */ +static int ocfs2_recover_local_quota_file(struct inode *lqinode, + int type, + struct ocfs2_quota_recovery *rec) +{ + struct super_block *sb = lqinode->i_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_local_disk_chunk *dchunk; + struct ocfs2_local_disk_dqblk *dqblk; + struct dquot *dquot; + handle_t *handle; + struct buffer_head *hbh = NULL, *qbh = NULL; + int status = 0; + int bit, chunk; + struct ocfs2_recovery_chunk *rchunk, *next; + qsize_t spacechange, inodechange; + + mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { + chunk = rchunk->rc_chunk; + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, chunk), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + qbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_dqblk_block(sb, chunk, bit), + &qbh); + if (status) { + mlog_errno(status); + break; + } + dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + + ol_dqblk_block_off(sb, chunk, bit)); + dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); + if (!dquot) { + status = -EIO; + mlog(ML_ERROR, "Failed to get quota structure " + "for id %u, type %d. Cannot finish quota " + "file recovery.\n", + (unsigned)le64_to_cpu(dqblk->dqb_id), + type); + goto out_put_bh; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_put_dquot; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + spin_lock(&dq_data_lock); + /* Add usage from quota entry into quota changes + * of our node. Auxiliary variables are important + * due to signedness */ + spacechange = le64_to_cpu(dqblk->dqb_spacemod); + inodechange = le64_to_cpu(dqblk->dqb_inodemod); + dquot->dq_dqb.dqb_curspace += spacechange; + dquot->dq_dqb.dqb_curinodes += inodechange; + spin_unlock(&dq_data_lock); + /* We want to drop reference held by the crashed + * node. Since we have our own reference we know + * global structure actually won't be freed. */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + /* Release local quota file entry */ + status = ocfs2_journal_access_dq(handle, lqinode, + qbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + lock_buffer(qbh); + WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(qbh); + status = ocfs2_journal_dirty(handle, qbh); + if (status < 0) + mlog_errno(status); +out_commit: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_put_dquot: + dqput(dquot); +out_put_bh: + brelse(qbh); + if (status < 0) + break; + } + brelse(hbh); + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + if (status < 0) + break; + } + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status < 0) + free_recovery_list(&(rec->r_list[type])); + mlog_exit(status); + return status; +} + +/* Recover local quota files for given node different from us */ +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num) +{ + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct buffer_head *bh; + handle_t *handle; + int type; + int status = 0; + struct inode *lqinode; + unsigned int flags; + + mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (type = 0; type < MAXQUOTAS; type++) { + if (list_empty(&(rec->r_list[type]))) + continue; + mlog(0, "Recovering quota in slot %d\n", slot_num); + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_NOQUEUE); + /* Someone else is holding the lock? Then he must be + * doing the recovery. Just skip the file... */ + if (status == -EAGAIN) { + mlog(ML_NOTICE, "skipping quota recovery for slot %d " + "because quota file is locked.\n", slot_num); + status = 0; + goto out_put; + } else if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + /* Is recovery still needed? */ + flags = le32_to_cpu(ldinfo->dqi_flags); + if (!(flags & OLQF_CLEAN)) + status = ocfs2_recover_local_quota_file(lqinode, + type, + rec); + /* We don't want to mark file as clean when it is actually + * active */ + if (slot_num == osb->slot_num) + goto out_bh; + /* Mark quota file as clean if we are recovering quota file of + * some other node. */ + handle = ocfs2_start_trans(osb, 1); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_bh; + } + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) + mlog_errno(status); +out_trans: + ocfs2_commit_trans(osb, handle); +out_bh: + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + kfree(rec); + return status; +} + +/* Read information header from quota file */ +static int ocfs2_local_read_info(struct super_block *sb, int type) +{ + struct ocfs2_local_disk_dqinfo *ldinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + int status; + struct buffer_head *bh = NULL; + struct ocfs2_quota_recovery *rec; + int locked = 0; + + info->dqi_maxblimit = 0x7fffffffffffffffLL; + info->dqi_maxilimit = 0x7fffffffffffffffLL; + oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); + if (!oinfo) { + mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" + " info."); + goto out_err; + } + info->dqi_priv = oinfo; + oinfo->dqi_type = type; + INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_rec = NULL; + oinfo->dqi_lqi_bh = NULL; + oinfo->dqi_ibh = NULL; + + status = ocfs2_global_read_info(sb, type); + if (status < 0) + goto out_err; + + status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + locked = 1; + + /* Now read local header */ + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(type=%d)\n", type); + goto out_err; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); + oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); + oinfo->dqi_ibh = bh; + + /* We crashed when using local quota file? */ + if (!(info->dqi_flags & OLQF_CLEAN)) { + rec = OCFS2_SB(sb)->quota_rec; + if (!rec) { + rec = ocfs2_alloc_quota_recovery(); + if (!rec) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + OCFS2_SB(sb)->quota_rec = rec; + } + + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + } + + status = ocfs2_load_local_quota_bitmaps(lqinode, + ldinfo, + &oinfo->dqi_chunk); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now mark quota file as used */ + info->dqi_flags &= ~OLQF_CLEAN; + status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + return 0; +out_err: + if (oinfo) { + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + brelse(oinfo->dqi_lqi_bh); + if (locked) + ocfs2_inode_unlock(lqinode, 1); + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + kfree(oinfo); + } + brelse(bh); + return -1; +} + +/* Write local info to quota file */ +static int ocfs2_local_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) + ->dqi_ibh; + int status; + + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + return -1; + } + + return 0; +} + +/* Release info from memory */ +static int ocfs2_local_free_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int mark_clean = 1, len; + int status; + + /* At this point we know there are no more dquots and thus + * even if there's some sync in the pdflush queue, it won't + * find any dquots and return without doing anything */ + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + (chunk->qc_headerbh->b_data); + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + /* Not all entries free? Bug! */ + if (le32_to_cpu(dchunk->dqc_free) != len) { + mlog(ML_ERROR, "releasing quota file with used " + "entries (type=%d)\n", type); + mark_clean = 0; + } + } + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + + /* dqonoff_mutex protects us against racing with recovery thread... */ + if (oinfo->dqi_rec) { + ocfs2_free_quota_recovery(oinfo->dqi_rec); + mark_clean = 0; + } + + if (!mark_clean) + goto out; + + /* Mark local file as clean */ + info->dqi_flags |= OLQF_CLEAN; + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], + oinfo->dqi_ibh, + olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + goto out; + } + +out: + ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); + brelse(oinfo->dqi_ibh); + brelse(oinfo->dqi_lqi_bh); + kfree(oinfo); + return 0; +} + +static void olq_set_dquot(struct buffer_head *bh, void *private) +{ + struct ocfs2_dquot *od = private; + struct ocfs2_local_disk_dqblk *dqblk; + struct super_block *sb = od->dq_dquot.dq_sb; + + dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + + ol_dqblk_block_offset(sb, od->dq_local_off)); + + dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); + spin_lock(&dq_data_lock); + dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - + od->dq_origspace); + dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - + od->dq_originodes); + spin_unlock(&dq_data_lock); + mlog(0, "Writing local dquot %u space %lld inodes %lld\n", + od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), + (long long)le64_to_cpu(dqblk->dqb_inodemod)); +} + +/* Write dquot to local quota file */ +static int ocfs2_local_write_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct buffer_head *bh = NULL; + int status; + + status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], + ol_dqblk_file_block(sb, od->dq_local_off), + &bh); + if (status) { + mlog_errno(status); + goto out; + } + status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, + olq_set_dquot, od); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + brelse(bh); + return status; +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int found = 0, len; + + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + chunk->qc_headerbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) > 0) { + found = 1; + break; + } + } + if (!found) + return NULL; + + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + + found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + /* We failed? */ + if (found == len) { + mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" + " entries free (type=%d)\n", chunk->qc_num, + le32_to_cpu(dchunk->dqc_free), type); + return ERR_PTR(-EIO); + } + *offset = found; + return chunk; +} + +/* Add new chunk to the local quota file */ +static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk = NULL; + struct ocfs2_local_disk_chunk *dchunk; + int status; + handle_t *handle; + struct buffer_head *bh = NULL; + u64 p_blkno; + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, + lqinode->i_size + 2 * sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + 2 * sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!chunk) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); + memset(dchunk->dqc_bitmap, 0, + sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE); + set_buffer_uptodate(bh); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + oinfo->dqi_blocks += 2; + oinfo->dqi_chunks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + + list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); + chunk->qc_num = list_entry(chunk->qc_chunk.prev, + struct ocfs2_quota_chunk, + qc_chunk)->qc_num + 1; + chunk->qc_headerbh = bh; + *offset = 0; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + brelse(bh); + kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); + return ERR_PTR(status); +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_local_disk_chunk *dchunk; + int epb = ol_quota_entries_per_block(sb); + unsigned int chunk_blocks; + int status; + handle_t *handle; + + if (list_empty(&oinfo->dqi_chunk)) + return ocfs2_local_quota_add_chunk(sb, type, offset); + /* Is the last chunk full? */ + chunk = list_entry(oinfo->dqi_chunk.prev, + struct ocfs2_quota_chunk, qc_chunk); + chunk_blocks = oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1; + if (ol_chunk_blocks(sb) == chunk_blocks) + return ocfs2_local_quota_add_chunk(sb, type, offset); + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, + lqinode->i_size + sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; + lock_buffer(chunk->qc_headerbh); + le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); + unlock_buffer(chunk->qc_headerbh); + status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + oinfo->dqi_blocks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + *offset = chunk_blocks * epb; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + return ERR_PTR(status); +} + +static void olq_alloc_dquot(struct buffer_head *bh, void *private) +{ + int *offset = private; + struct ocfs2_local_disk_chunk *dchunk; + + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, -1); +} + +/* Create dquot in the local file for given id */ +static int ocfs2_create_local_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + int offset; + int status; + + chunk = ocfs2_find_free_entry(sb, type, &offset); + if (!chunk) { + chunk = ocfs2_extend_local_quota_file(sb, type, &offset); + if (IS_ERR(chunk)) + return PTR_ERR(chunk); + } else if (IS_ERR(chunk)) { + return PTR_ERR(chunk); + } + od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); + od->dq_chunk = chunk; + + /* Initialize dquot structure on disk */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Mark structure as allocated */ + status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, + &offset); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + return status; +} + +/* Create entry in local file for dquot, load data from the global file */ +static int ocfs2_local_read_dquot(struct dquot *dquot) +{ + int status; + + mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type); + + status = ocfs2_global_read_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now create entry in the local quota file */ + status = ocfs2_create_local_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + mlog_exit(0); + return 0; +out_err: + mlog_exit(status); + return status; +} + +/* Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and obtained exclusive lock for global + * quota file. */ +static int ocfs2_local_release_dquot(struct dquot *dquot) +{ + int status; + int type = dquot->dq_type; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct super_block *sb = dquot->dq_sb; + struct ocfs2_local_disk_chunk *dchunk; + int offset; + handle_t *handle = journal_current_handle(); + + BUG_ON(!handle); + /* First write all local changes to global file */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], + od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, + od->dq_local_off); + dchunk = (struct ocfs2_local_disk_chunk *) + (od->dq_chunk->qc_headerbh->b_data); + /* Mark structure as freed */ + lock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(od->dq_chunk->qc_headerbh); + status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = 0; +out: + /* Clear the read bit so that next time someone uses this + * dquot he reads fresh info from disk and allocates local + * dquot structure */ + clear_bit(DQ_READ_B, &dquot->dq_flags); + return status; +} + +static struct quota_format_ops ocfs2_format_ops = { + .check_quota_file = ocfs2_local_check_quota_file, + .read_file_info = ocfs2_local_read_info, + .write_file_info = ocfs2_global_write_info, + .free_file_info = ocfs2_local_free_info, + .read_dqblk = ocfs2_local_read_dquot, + .commit_dqblk = ocfs2_local_write_dquot, + .release_dqblk = ocfs2_local_release_dquot, +}; + +struct quota_format_type ocfs2_quota_format = { + .qf_fmt_id = QFMT_OCFS2, + .qf_ops = &ocfs2_format_ops, + .qf_owner = THIS_MODULE +}; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ffd48db229a7..424adaa5f900 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", new_clusters, first_new_cluster); - ret = ocfs2_journal_access(handle, bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, } /* update the inode accordingly. */ - ret = ocfs2_journal_access(handle, bm_inode, bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_rollback; @@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), + * so any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != ocfs2_group_bitmap_size(osb->sb) * 8) { mlog(ML_ERROR, "The disk is too old and small. " @@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out_unlock; } - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe); - ret = -EIO; - goto out_unlock; - } - first_new_cluster = le32_to_cpu(fe->i_clusters); lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); + ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, + &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; } - group = (struct ocfs2_group_desc *)group_bh->b_data; - ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > le16_to_cpu(fe->id2.i_chain.cl_cpg)) { @@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode, struct buffer_head *group_bh) { int ret; - struct ocfs2_group_desc *gd; + struct ocfs2_group_desc *gd = + (struct ocfs2_group_desc *)group_bh->b_data; u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); - unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * - le16_to_cpu(di->id2.i_chain.cl_bpc); - - gd = (struct ocfs2_group_desc *)group_bh->b_data; + ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); + if (ret) + goto out; - ret = -EIO; - if (!OCFS2_IS_VALID_GROUP_DESC(gd)) - mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno)); - else if (di->i_blkno != gd->bg_parent_dinode) - mlog(ML_ERROR, "Group descriptor # %llu has bad parent " - "pointer (%llu, expected %llu)\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), - (unsigned long long)le64_to_cpu(di->i_blkno)); - else if (le16_to_cpu(gd->bg_bits) > max_bits) - mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits)); - else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) - mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " - "claims that %u are free\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - le16_to_cpu(gd->bg_free_bits_count)); - else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) - mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " - "max bitmap bits of %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - 8 * le16_to_cpu(gd->bg_size)); - else if (le16_to_cpu(gd->bg_chain) != input->chain) + ret = -EINVAL; + if (le16_to_cpu(gd->bg_chain) != input->chain) mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " "while input has %u set.\n", (unsigned long long)le64_to_cpu(gd->bg_blkno), @@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode, else ret = 0; +out: return ret; } @@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) cl = &fe->id2.i_chain; cr = &cl->cl_recs[input->chain]; - ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; @@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_commit; } - ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bdda2d8f8508..40661e7824e9 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * this is not true, the read of -1 (UINT64_MAX) will fail. */ ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, - OCFS2_BH_IGNORE_CACHE); + OCFS2_BH_IGNORE_CACHE, NULL); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, bh = NULL; /* Acquire a fresh bh */ status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE); + OCFS2_BH_IGNORE_CACHE, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index c5ff18b46b57..a69628603e18 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -35,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" @@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } -/* somewhat more expensive than our other checks, so use sparingly. */ -int ocfs2_check_group_descriptor(struct super_block *sb, - struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd) +#define do_error(fmt, ...) \ + do{ \ + if (clean_error) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int clean_error) { - unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); - return -EIO; + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; } + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int clean_error) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + if (di->i_blkno != gd->bg_parent_dinode) { - ocfs2_error(sb, "Group descriptor # %llu has bad parent " - "pointer (%llu, expected %llu)", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), - (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EIO; + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits)); - return -EIO; + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; } if (le16_to_cpu(gd->bg_chain) >= le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { - ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_chain)); - return -EIO; + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; } - if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "claims that %u are free", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - le16_to_cpu(gd->bg_free_bits_count)); - return -EIO; - } + return 0; +} - if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "max bitmap bits of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - 8 * le16_to_cpu(gd->bg_size)); - return -EIO; +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + mlog(0, "Validating group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; } - return 0; + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc; } static int ocfs2_block_group_fill(handle_t *handle, @@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, - alloc_inode, - bg_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_gd(handle, + alloc_inode, + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg = (struct ocfs2_group_desc *) bg_bh->b_data; - status = ocfs2_journal_access(handle, alloc_inode, - bh, OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, ac->ac_alloc_slot = slot; fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", (unsigned long long)le64_to_cpu(fe->i_blkno)); @@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); - return -EIO; - } + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; @@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, @@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access(handle, - alloc_inode, - group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, + alloc_inode, + group_bh, + journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle, struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); - status = -EIO; - goto out; - } + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", (unsigned long long)le64_to_cpu(fe->i_blkno), chain, @@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle, bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); - status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access(handle, alloc_inode, bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access(handle, alloc_inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1008,7 +1116,7 @@ out_rollback: bg->bg_next_group = cpu_to_le64(bg_ptr); prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); } -out: + mlog_exit(status); return status; } @@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, u16 found; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); + ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, + &group_bh); if (ret < 0) { mlog_errno(ret); return ret; } gd = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); - ret = -EIO; - goto out; - } - ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, ac->ac_max_block, bit_off, &found); if (ret < 0) { @@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(alloc_inode, - le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use @@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(alloc_inode, - next_group, &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } } if (status < 0) { if (status != -ENOSPC) @@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* Ok, claim our bits now: set the info on dinode, chainlist * and then the group */ - status = ocfs2_journal_access(handle, - alloc_inode, - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, + alloc_inode, + ac->ac_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!ac->ac_bh); fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " @@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); mlog(0, "off = %u, num = %u\n", bit_off, num_bits); if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access(handle, alloc_inode, group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, + journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happended + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } - group = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); - if (status) { - mlog_errno(status); - goto bail; - } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); status = ocfs2_block_group_clear_bits(handle, alloc_inode, @@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 4df159d8f450..e3c13c77f9e8 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); * and return that block offset. */ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); -/* somewhat more expensive than our other checks, so use sparingly. */ +/* + * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it + * finds a problem. A caller that wants to check a group descriptor + * without going readonly should read the block with ocfs2_read_block[s]() + * and then checking it with this function. This is only resize, really. + * Everyone else should be using ocfs2_read_group_descriptor(). + */ int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd); + struct buffer_head *bh); +/* + * Read a group descriptor block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The descriptor will be validated with + * ocfs2_validate_group_descriptor(). + */ +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh); + int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 304b63ac78cf..43ed11345b59 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> @@ -51,6 +52,7 @@ #include "ocfs1_fs_compat.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "export.h" #include "extent_map.h" @@ -65,10 +67,13 @@ #include "uptodate.h" #include "ver.h" #include "xattr.h" +#include "quota.h" #include "buffer_head_io.h" static struct kmem_cache *ocfs2_inode_cachep = NULL; +struct kmem_cache *ocfs2_dquot_cachep; +struct kmem_cache *ocfs2_qf_chunk_cachep; /* OCFS2 needs to schedule several differnt types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these @@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb, static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); +static int ocfs2_enable_quotas(struct ocfs2_super *osb); +static void ocfs2_disable_quotas(struct ocfs2_super *osb); static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, @@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = { .put_super = ocfs2_put_super, .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, + .quota_read = ocfs2_quota_read, + .quota_write = ocfs2_quota_write, }; enum { @@ -158,6 +168,10 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_inode64, + Opt_acl, + Opt_noacl, + Opt_usrquota, + Opt_grpquota, Opt_err, }; @@ -180,6 +194,10 @@ static const match_table_t tokens = { {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_inode64, "inode64"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, {Opt_err, NULL} }; @@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) return 0; } +static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) +{ + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + && (ino == USER_QUOTA_SYSTEM_INODE + || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) + return 0; + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + && (ino == GROUP_QUOTA_SYSTEM_INODE + || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) + return 0; + return 1; +} + static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) { struct inode *new = NULL; @@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); @@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); @@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ + if (*flags & MS_RDONLY) { + ret = ocfs2_susp_quotas(osb, 0); + if (ret < 0) + goto out; + } /* Lock here so the check of HARD_RO and the potential * setting of SOFT_RO is atomic. */ spin_lock(&osb->osb_lock); @@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } unlock_osb: spin_unlock(&osb->osb_lock); + /* Enable quota accounting after remounting RW */ + if (!ret && !(*flags & MS_RDONLY)) { + if (sb_any_quota_suspended(sb)) + ret = ocfs2_susp_quotas(osb, 1); + else + ret = ocfs2_enable_quotas(osb); + if (ret < 0) { + /* Return back changes... */ + spin_lock(&osb->osb_lock); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + goto out; + } + } } if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ + if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) + parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, return 0; } +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) +{ + int type; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + int status = 0; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + if (unsuspend) + status = vfs_quota_enable( + sb_dqopt(sb)->files[type], + type, QFMT_OCFS2, + DQUOT_SUSPENDED); + else + status = vfs_quota_disable(sb, type, + DQUOT_SUSPENDED); + if (status < 0) + break; + } + if (status < 0) + mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " + "remount (error = %d).\n", status); + return status; +} + +static int ocfs2_enable_quotas(struct ocfs2_super *osb) +{ + struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + int status; + int type; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + inode[type] = ocfs2_get_system_file_inode(osb, ino[type], + osb->slot_num); + if (!inode[type]) { + status = -ENOENT; + goto out_quota_off; + } + status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); + if (status < 0) + goto out_quota_off; + } + + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + return 0; +out_quota_off: + ocfs2_disable_quotas(osb); + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + mlog_errno(status); + return status; +} + +static void ocfs2_disable_quotas(struct ocfs2_super *osb) +{ + int type; + struct inode *inode; + struct super_block *sb = osb->sb; + + /* We mostly ignore errors in this function because there's not much + * we can do when we see them */ + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_loaded(sb, type)) + continue; + inode = igrab(sb->s_dquot.files[type]); + /* Turn off quotas. This will remove all dquot structures from + * memory and so they will be automatically synced to global + * quota files */ + vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (!inode) + continue; + iput(inode); + } +} + +/* Handle quota on quotactl */ +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + return -EINVAL; + + if (remount) + return 0; /* Just ignore it has been handled in + * ocfs2_remount() */ + return vfs_quota_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); +} + +/* Handle quota off quotactl */ +static int ocfs2_quota_off(struct super_block *sb, int type, int remount) +{ + if (remount) + return 0; /* Ignore now and handle later in + * ocfs2_remount() */ + return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); +} + +static struct quotactl_ops ocfs2_quotactl_ops = { + .quota_on = ocfs2_quota_on, + .quota_off = ocfs2_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk, +}; + static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } brelse(bh); bh = NULL; + + if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) + parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; + if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + status = -EINVAL; + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + goto read_super_error; + } + if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + status = -EINVAL; + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + goto read_super_error; + } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { @@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); + /* Now we can initialize quotas because we can afford to wait + * for cluster locks recovery now. That also means that truncation + * log recovery can happen but that waits for proper quota setup */ + if (!(sb->s_flags & MS_RDONLY)) { + status = ocfs2_enable_quotas(osb); + if (status < 0) { + /* We have to err-out specially here because + * s_root is already set */ + mlog_errno(status); + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + mlog_exit(status); + return status; + } + } + + ocfs2_complete_quota_recovery(osb); + + /* Now we wake up again for processes waiting for quotas */ + atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); + wake_up(&osb->osb_mount_event); + mlog_exit(status); return status; @@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; + case Opt_usrquota: + /* We check only on remount, otherwise features + * aren't yet initialized. */ + if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quota requested but " + "filesystem feature is not set\n"); + status = 0; + goto bail; + } + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quota requested but " + "filesystem feature is not set\n"); + status = 0; + goto bail; + } + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + case Opt_acl: + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + break; + case Opt_noacl: + mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + break; +#else + case Opt_acl: + case Opt_noacl: + printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); + break; +#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->osb_cluster_stack[0]) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_USRQUOTA) + seq_printf(s, ",usrquota"); + if (opts & OCFS2_MOUNT_GRPQUOTA) + seq_printf(s, ",grpquota"); if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); @@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + if (opts & OCFS2_MOUNT_POSIX_ACL) + seq_printf(s, ",acl"); + else + seq_printf(s, ",noacl"); +#endif + return 0; } @@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void) mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); } + status = ocfs2_quota_setup(); + if (status) + goto leave; + ocfs2_set_locking_protocol(); + status = register_quota_format(&ocfs2_quota_format); leave: if (status < 0) { + ocfs2_quota_shutdown(); ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); } @@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void) { mlog_entry_void(); + ocfs2_quota_shutdown(); + if (ocfs2_wq) { flush_workqueue(ocfs2_wq); destroy_workqueue(ocfs2_wq); } + unregister_quota_format(&ocfs2_quota_format); + debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); @@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void) (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), ocfs2_inode_init_once); - if (!ocfs2_inode_cachep) + ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", + sizeof(struct ocfs2_dquot), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", + sizeof(struct ocfs2_quota_chunk), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || + !ocfs2_qf_chunk_cachep) { + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; + } return 0; } @@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void) { if (ocfs2_inode_cachep) kmem_cache_destroy(ocfs2_inode_cachep); - ocfs2_inode_cachep = NULL; + + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + ocfs2_dquot_cachep = NULL; + + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + ocfs2_qf_chunk_cachep = NULL; } static int ocfs2_get_sector(struct super_block *sb, @@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + ocfs2_disable_quotas(osb); + ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); @@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_qcop = &ocfs2_quotactl_ops; + sb->dq_op = &ocfs2_quota_operations; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; @@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* We have to do a raw check of the feature here */ + if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & + OCFS2_FEATURE_INCOMPAT_META_ECC) { + status = ocfs2_block_check_validate(bh->b_data, + bh->b_size, + &di->i_check); + if (status) + goto out; + } status = -EINVAL; if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " @@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, } } +out: mlog_exit(status); return status; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index cbd03dfdc7b9..ed0a0cfd68d2 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); + status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 74d7367ade13..e1d638af6ac3 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -35,12 +35,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> +#include <linux/security.h> #define MLOG_MASK_PREFIX ML_XATTR #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "file.h" #include "symlink.h" @@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root { }; struct ocfs2_xattr_bucket { - struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; - struct ocfs2_xattr_header *xh; + /* The inode these xattrs are associated with */ + struct inode *bu_inode; + + /* The actual buffers that make up the bucket */ + struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + + /* How many blocks make up one bucket for this filesystem */ + int bu_blocks; +}; + +struct ocfs2_xattr_set_ctxt { + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; }; #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) #define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ + - sizeof(struct ocfs2_xattr_header) \ + - sizeof(__u32)) +#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ + - sizeof(struct ocfs2_xattr_block) \ + - sizeof(struct ocfs2_xattr_header) \ + - sizeof(__u32)) static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), @@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + &ocfs2_xattr_acl_access_handler, + &ocfs2_xattr_acl_default_handler, +#endif &ocfs2_xattr_trusted_handler, + &ocfs2_xattr_security_handler, NULL }; static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] + = &ocfs2_xattr_acl_access_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] + = &ocfs2_xattr_acl_default_handler, +#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -98,7 +132,7 @@ struct ocfs2_xattr_search { */ struct buffer_head *xattr_bh; struct ocfs2_xattr_header *header; - struct ocfs2_xattr_bucket bucket; + struct ocfs2_xattr_bucket *bucket; void *base; void *end; struct ocfs2_xattr_entry *here; @@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, size_t buffer_size); static int ocfs2_xattr_create_index_block(struct inode *inode, - struct ocfs2_xattr_search *xs); + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs); + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); static int ocfs2_delete_xattr_index_block(struct inode *inode, struct buffer_head *xb_bh); +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) return len / sizeof(struct ocfs2_xattr_entry); } +#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) +#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) +#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) + +static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode) +{ + struct ocfs2_xattr_bucket *bucket; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); + + bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); + if (bucket) { + bucket->bu_inode = inode; + bucket->bu_blocks = blks; + } + + return bucket; +} + +static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) +{ + int i; + + for (i = 0; i < bucket->bu_blocks; i++) { + brelse(bucket->bu_bhs[i]); + bucket->bu_bhs[i] = NULL; + } +} + +static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) +{ + if (bucket) { + ocfs2_xattr_bucket_relse(bucket); + bucket->bu_inode = NULL; + kfree(bucket); + } +} + +/* + * A bucket that has never been written to disk doesn't need to be + * read. We just need the buffer_heads. Don't call this for + * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes + * them fully. + */ +static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, + xb_blkno + i); + if (!bucket->bu_bhs[i]) { + rc = -EIO; + mlog_errno(rc); + break; + } + + if (!ocfs2_buffer_uptodate(bucket->bu_inode, + bucket->bu_bhs[i])) + ocfs2_set_new_buffer_uptodate(bucket->bu_inode, + bucket->bu_bhs[i]); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +/* Read the xattr bucket at xb_blkno */ +static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int rc; + + rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, + bucket->bu_blocks, bucket->bu_bhs, 0, + NULL); + if (!rc) { + rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, + bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + if (rc) + mlog_errno(rc); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +static int ocfs2_xattr_bucket_journal_access(handle_t *handle, + struct ocfs2_xattr_bucket *bucket, + int type) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + rc = ocfs2_journal_access(handle, bucket->bu_inode, + bucket->bu_bhs[i], type); + if (rc) { + mlog_errno(rc); + break; + } + } + + return rc; +} + +static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int i; + + ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + + for (i = 0; i < bucket->bu_blocks; i++) + ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); +} + +static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, + struct ocfs2_xattr_bucket *src) +{ + int i; + int blocksize = src->bu_inode->i_sb->s_blocksize; + + BUG_ON(dest->bu_blocks != src->bu_blocks); + BUG_ON(dest->bu_inode != src->bu_inode); + + for (i = 0; i < src->bu_blocks; i++) { + memcpy(bucket_block(dest, i), bucket_block(src, i), + blocksize); + } +} + +static int ocfs2_validate_xattr_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + mlog(0, "Validating xattr block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); + if (rc) + return rc; + + /* + * Errors after here are fatal + */ + + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ocfs2_error(sb, + "Extended attribute block #%llu has bad " + "signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); + return -EINVAL; + } + + if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extended attribute block #%llu has an " + "invalid xb_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extended attribute block #%llu has an invalid " + "xb_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, xb_blkno, &tmp, + ocfs2_validate_xattr_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + static inline const char *ocfs2_xattr_prefix(int name_index) { struct xattr_handler *handler = NULL; @@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode, return; } +static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) +{ + int size = 0; + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); + else + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + size += sizeof(struct ocfs2_xattr_entry); + + return size; +} + +int ocfs2_calc_security_init(struct inode *dir, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * So reserve one metadata block for it is ok. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + s_size > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + /* reserve clusters for xattr value which will be set in B tree*/ + if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { + int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + return ret; +} + +int ocfs2_calc_xattr_init(struct inode *dir, + struct buffer_head *dir_bh, + int mode, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = 0, a_size = 0, acl_len = 0, new_clusters; + + if (si->enable) + s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + "", NULL, 0); + if (acl_len > 0) { + a_size = ocfs2_xattr_entry_real_size(0, acl_len); + if (S_ISDIR(mode)) + a_size <<= 1; + } else if (acl_len != 0 && acl_len != -ENODATA) { + mlog_errno(ret); + return ret; + } + } + + if (!(s_size + a_size)) + return ret; + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * The max space of acl xattr taken inline is + * 80(value) + 16(entry) * 2(if directory) = 192 bytes, + * when blocksize = 512, may reserve one more cluser for + * xattr bucket, otherwise reserve one metadata block + * for them is ok. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && + (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { + *want_clusters += 1; + *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); + } + + /* + * reserve credits and clusters for xattrs which has large value + * and have to be set outside + */ + if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { + new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && + acl_len > OCFS2_XATTR_INLINE_SIZE) { + /* for directory, it has DEFAULT and ACCESS two types of acls */ + new_clusters = (S_ISDIR(mode) ? 2 : 1) * + ocfs2_clusters_for_bytes(dir->i_sb, acl_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + + return ret; +} + static int ocfs2_xattr_extend_allocation(struct inode *inode, u32 clusters_to_add, - struct buffer_head *xattr_bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) { int status = 0; - int restart_func = 0; - int credits = 0; - handle_t *handle = NULL; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); struct ocfs2_extent_tree et; mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); - ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); - -restart_all: - - status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, - &data_ac, &meta_ac); - if (status) { - mlog_errno(status); - goto leave; - } - - credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, - clusters_to_add); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto leave; - } + ocfs2_init_xattr_value_extent_tree(&et, inode, vb); -restarted_transaction: - status = ocfs2_journal_access(handle, inode, xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; } - prev_clusters = le32_to_cpu(xv->xr_clusters); + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); status = ocfs2_add_clusters_in_btree(osb, inode, &logical_start, @@ -255,157 +614,84 @@ restarted_transaction: 0, &et, handle, - data_ac, - meta_ac, + ctxt->data_ac, + ctxt->meta_ac, &why); - if ((status < 0) && (status != -EAGAIN)) { - if (status != -ENOSPC) - mlog_errno(status); + if (status < 0) { + mlog_errno(status); goto leave; } - status = ocfs2_journal_dirty(handle, xattr_bh); + status = ocfs2_journal_dirty(handle, vb->vb_bh); if (status < 0) { mlog_errno(status); goto leave; } - clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; - if (why != RESTART_NONE && clusters_to_add) { - if (why == RESTART_META) { - mlog(0, "restarting function.\n"); - restart_func = 1; - } else { - BUG_ON(why != RESTART_TRANS); - - mlog(0, "restarting transaction.\n"); - /* TODO: This can be more intelligent. */ - credits = ocfs2_calc_extend_credits(osb->sb, - et.et_root_el, - clusters_to_add); - status = ocfs2_extend_trans(handle, credits); - if (status < 0) { - /* handle still has to be committed at - * this point. */ - status = -ENOMEM; - mlog_errno(status); - goto leave; - } - goto restarted_transaction; - } - } + /* + * We should have already allocated enough space before the transaction, + * so no need to restart. + */ + BUG_ON(why != RESTART_NONE || clusters_to_add); leave: - if (handle) { - ocfs2_commit_trans(osb, handle); - handle = NULL; - } - if (data_ac) { - ocfs2_free_alloc_context(data_ac); - data_ac = NULL; - } - if (meta_ac) { - ocfs2_free_alloc_context(meta_ac); - meta_ac = NULL; - } - if ((!status) && restart_func) { - restart_func = 0; - goto restart_all; - } return status; } static int __ocfs2_remove_xattr_range(struct inode *inode, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv, + struct ocfs2_xattr_value_buf *vb, u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_xattr_set_ctxt *ctxt) { int ret; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - handle_t *handle; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_extent_tree et; - ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + ocfs2_init_xattr_value_extent_tree(&et, inode, vb); - ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - return ret; - } - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, - dealloc); + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, + &ctxt->dealloc); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - le32_add_cpu(&xv->xr_clusters, -len); + le32_add_cpu(&vb->vb_xv->xr_clusters, -len); - ret = ocfs2_journal_dirty(handle, root_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); if (ret) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - return ret; } static int ocfs2_xattr_shrink_size(struct inode *inode, u32 old_clusters, u32 new_clusters, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret = 0; u32 trunc_len, cpos, phys_cpos, alloc_size; u64 block; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_cached_dealloc_ctxt dealloc; - - ocfs2_init_dealloc_ctxt(&dealloc); if (old_clusters <= new_clusters) return 0; @@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, trunc_len = old_clusters - new_clusters; while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, - &alloc_size, &xv->xr_list); + &alloc_size, + &vb->vb_xv->xr_list); if (ret) { mlog_errno(ret); goto out; @@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, if (alloc_size > trunc_len) alloc_size = trunc_len; - ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + ret = __ocfs2_remove_xattr_range(inode, vb, cpos, phys_cpos, alloc_size, - &dealloc); + ctxt); if (ret) { mlog_errno(ret); goto out; @@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, } out: - ocfs2_schedule_truncate_log_flush(osb, 1); - ocfs2_run_deallocs(osb, &dealloc); - return ret; } static int ocfs2_xattr_value_truncate(struct inode *inode, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv, - int len) + struct ocfs2_xattr_value_buf *vb, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret; u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); - u32 old_clusters = le32_to_cpu(xv->xr_clusters); + u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); if (new_clusters == old_clusters) return 0; @@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, if (new_clusters > old_clusters) ret = ocfs2_xattr_extend_allocation(inode, new_clusters - old_clusters, - root_bh, xv); + vb, ctxt); else ret = ocfs2_xattr_shrink_size(inode, old_clusters, new_clusters, - root_bh, xv); + vb, ctxt); return ret; } @@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode, if (!di->i_xattr_loc) return ret; - ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); if (ret < 0) { mlog_errno(ret); return ret; } xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto cleanup; - } - if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; ret = ocfs2_xattr_list_entries(inode, header, @@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode, ret = ocfs2_xattr_tree_list_index_block(inode, xt, buffer, buffer_size); } -cleanup: + brelse(blk_bh); return ret; @@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); /* Copy ocfs2_xattr_value */ for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_block(inode, blkno, &bh, NULL); if (ret) { mlog_errno(ret); goto out; @@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode, size_t size; int ret = -ENODATA, name_offset, name_len, block_off, i; - memset(&xs->bucket, 0, sizeof(xs->bucket)); + xs->bucket = ocfs2_xattr_bucket_new(inode); + if (!xs->bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto cleanup; + } ret = ocfs2_xattr_block_find(inode, name_index, name, xs); if (ret) { @@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode, if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { ret = ocfs2_xattr_bucket_get_name_value(inode, - xs->bucket.xh, + bucket_xh(xs->bucket), i, &block_off, &name_offset); - xs->base = xs->bucket.bhs[block_off]->b_data; + xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { memcpy(buffer, (void *)xs->base + @@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode, } ret = size; cleanup: - for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) - brelse(xs->bucket.bhs[i]); - memset(&xs->bucket, 0, sizeof(xs->bucket)); + ocfs2_xattr_bucket_free(xs->bucket); brelse(xs->xattr_bh); xs->xattr_bh = NULL; return ret; } -/* ocfs2_xattr_get() - * - * Copy an extended attribute into the buffer provided. - * Buffer is NULL to compute the size of buffer required. - */ -static int ocfs2_xattr_get(struct inode *inode, +int ocfs2_xattr_get_nolock(struct inode *inode, + struct buffer_head *di_bh, int name_index, const char *name, void *buffer, @@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode, { int ret; struct ocfs2_dinode *di = NULL; - struct buffer_head *di_bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_xattr_search xis = { .not_found = -ENODATA, @@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) ret = -ENODATA; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; - } xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode, ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, buffer_size, &xbs); up_read(&oi->ip_xattr_sem); + + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct buffer_head *di_bh = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + name, buffer, buffer_size); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); @@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode, } static int __ocfs2_xattr_set_value_outside(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_value_root *xv, const void *value, int value_len) { - int ret = 0, i, cp_len, credits; + int ret = 0, i, cp_len; u16 blocksize = inode->i_sb->s_blocksize; u32 p_cluster, num_clusters; u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); u64 blkno; struct buffer_head *bh = NULL; - handle_t *handle; BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); - credits = clusters * bpc; - handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, &xv->xr_list); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_block(inode, blkno, &bh, NULL); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } ret = ocfs2_journal_access(handle, @@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } cp_len = value_len > blocksize ? blocksize : value_len; @@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } brelse(bh); bh = NULL; @@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, } cpos += num_clusters; } -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: brelse(bh); @@ -960,28 +1249,22 @@ out: } static int ocfs2_xattr_cleanup(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_value_buf *vb, size_t offs) { - handle_t *handle = NULL; int ret = 0; size_t name_len = strlen(xi->name); void *val = xs->base + offs; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } /* Decrease xattr count */ le16_add_cpu(&xs->header->xh_count, -1); @@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode, memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); memset(val, 0, size); - ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } static int ocfs2_xattr_update_entry(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_value_buf *vb, size_t offs) { - handle_t *handle = NULL; - int ret = 0; + int ret; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } xs->here->xe_name_offset = cpu_to_le16(offs); @@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode, ocfs2_xattr_set_local(xs->here, 0); ocfs2_xattr_hash_entry(inode, xs->header, xs->here); - ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } @@ -1045,6 +1318,8 @@ out: static int ocfs2_xattr_set_value_outside(struct inode *inode, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt, + struct ocfs2_xattr_value_buf *vb, size_t offs) { size_t name_len = strlen(xi->name); @@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, xv->xr_list.l_tree_depth = 0; xv->xr_list.l_count = cpu_to_le16(1); xv->xr_list.l_next_free_rec = 0; + vb->vb_xv = xv; - ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, - xi->value_len); + ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); if (ret < 0) { mlog_errno(ret); return ret; } - ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, - xi->value_len); + ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); if (ret < 0) { mlog_errno(ret); return ret; } - ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, + xi->value, xi->value_len); if (ret < 0) mlog_errno(ret); @@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt, int flag) { struct ocfs2_xattr_entry *last; @@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); size_t size_l = 0; - handle_t *handle = NULL; + handle_t *handle = ctxt->handle; int free, i, ret; struct ocfs2_xattr_info xi_l = { .name_index = xi->name_index, @@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode, .value = xi->value, .value_len = xi->value_len, }; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = xs->xattr_bh, + .vb_access = ocfs2_journal_access_di, + }; + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + BUG_ON(xs->xattr_bh == xs->inode_bh); + vb.vb_access = ocfs2_journal_access_xb; + } else + BUG_ON(xs->xattr_bh != xs->inode_bh); /* Compute min_offs, last and free space. */ last = xs->header->xh_entries; @@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (ocfs2_xattr_is_local(xs->here) && size == size_l) { /* Replace existing local xattr with tree root */ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, - offs); + ctxt, &vb, offs); if (ret < 0) mlog_errno(ret); goto out; } else if (!ocfs2_xattr_is_local(xs->here)) { /* For existing xattr which has value outside */ - struct ocfs2_xattr_value_root *xv = NULL; - xv = (struct ocfs2_xattr_value_root *)(val + - OCFS2_XATTR_SIZE(name_len)); + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { /* @@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode, * then set new value with set_value_outside(). */ ret = ocfs2_xattr_value_truncate(inode, - xs->xattr_bh, - xv, - xi->value_len); + &vb, + xi->value_len, + ctxt); if (ret < 0) { mlog_errno(ret); goto out; } - ret = __ocfs2_xattr_set_value_outside(inode, - xv, - xi->value, - xi->value_len); + ret = ocfs2_xattr_update_entry(inode, + handle, + xi, + xs, + &vb, + offs); if (ret < 0) { mlog_errno(ret); goto out; } - ret = ocfs2_xattr_update_entry(inode, - xi, - xs, - offs); + ret = __ocfs2_xattr_set_value_outside(inode, + handle, + vb.vb_xv, + xi->value, + xi->value_len); if (ret < 0) mlog_errno(ret); goto out; @@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode, * just trucate old value to zero. */ ret = ocfs2_xattr_value_truncate(inode, - xs->xattr_bh, - xv, - 0); + &vb, + 0, + ctxt); if (ret < 0) mlog_errno(ret); } } } - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } if (!(flag & OCFS2_INLINE_XATTR_FL)) { - /* set extended attribute in external block. */ - ret = ocfs2_extend_trans(handle, - OCFS2_INODE_UPDATE_CREDITS + - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb.vb_access(handle, inode, vb.vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } } @@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ret = ocfs2_journal_dirty(handle, xs->xattr_bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } } @@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode, oi->ip_dyn_features |= flag; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - /* Update inode ctime */ - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ret = ocfs2_journal_dirty(handle, xs->inode_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); - if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { /* * Set value outside in B tree. * This is the second step for value size > INLINE_SIZE. */ size_t offs = le16_to_cpu(xs->here->xe_name_offset); - ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, + &vb, offs); if (ret < 0) { int ret2; @@ -1418,41 +1684,56 @@ out_commit: * If set value outside failed, we have to clean * the junk tree root we have already set in local. */ - ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle, + xi, xs, &vb, offs); if (ret2 < 0) mlog_errno(ret2); } } out: return ret; - } static int ocfs2_remove_value_outside(struct inode*inode, - struct buffer_head *bh, + struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_header *header) { int ret = 0, i; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + ctxt.handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out; + } for (i = 0; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; if (!ocfs2_xattr_is_local(entry)) { - struct ocfs2_xattr_value_root *xv; void *val; val = (void *)header + le16_to_cpu(entry->xe_name_offset); - xv = (struct ocfs2_xattr_value_root *) + vb->vb_xv = (struct ocfs2_xattr_value_root *) (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); - ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); if (ret < 0) { mlog_errno(ret); - return ret; + break; } } } + ocfs2_commit_trans(osb, ctxt.handle); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); +out: return ret; } @@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_xattr_header *header; int ret; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = di_bh, + .vb_access = ocfs2_journal_access_di, + }; header = (struct ocfs2_xattr_header *) ((void *)di + inode->i_sb->s_blocksize - le16_to_cpu(di->i_xattr_inline_size)); - ret = ocfs2_remove_value_outside(inode, di_bh, header); + ret = ocfs2_remove_value_outside(inode, &vb, header); return ret; } @@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode, { struct ocfs2_xattr_block *xb; int ret = 0; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); - ret = ocfs2_remove_value_outside(inode, blk_bh, header); + ret = ocfs2_remove_value_outside(inode, &vb, header); } else ret = ocfs2_delete_xattr_index_block(inode, blk_bh); @@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode, u64 blk, bg_blkno; u16 bit; - ret = ocfs2_read_block(inode, block, &blk_bh); + ret = ocfs2_read_xattr_block(inode, block, &blk_bh); if (ret < 0) { mlog_errno(ret); goto out; } - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto out; - } - ret = ocfs2_xattr_block_remove(inode, blk_bh); if (ret < 0) { mlog_errno(ret); goto out; } + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; blk = le64_to_cpu(xb->xb_blkno); bit = le16_to_cpu(xb->xb_suballoc_bit); bg_blkno = ocfs2_which_suballoc_group(blk, bit); @@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, */ static int ocfs2_xattr_ibody_set(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; @@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, } } - ret = ocfs2_xattr_set_entry(inode, xi, xs, + ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); out: up_write(&oi->ip_alloc_sem); @@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode, if (!di->i_xattr_loc) return ret; - ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); if (ret < 0) { mlog_errno(ret); return ret; } - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto cleanup; - } - xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { xs->header = &xb->xb_attrs.xb_header; @@ -1804,13 +2085,13 @@ cleanup: */ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - struct ocfs2_alloc_context *meta_ac = NULL; - handle_t *handle = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; u16 suballoc_bit_start; u32 num_got; @@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode, int ret; if (!xs->xattr_bh) { - /* - * Alloc one external block for extended attribute - * outside of inode. - */ - ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); - goto out; - } - handle = ocfs2_start_trans(osb, - OCFS2_XATTR_BLOCK_CREATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; + goto end; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, &suballoc_bit_start, &num_got, &first_blkno); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } new_bh = sb_getblk(inode->i_sb, first_blkno); ocfs2_set_new_buffer_uptodate(inode, new_bh); - ret = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_xb(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } /* Initialize ocfs2_xattr_block */ @@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode, xs->end = (void *)xblk + inode->i_sb->s_blocksize; xs->here = xs->header->xh_entries; - ret = ocfs2_journal_dirty(handle, new_bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } di->i_xattr_loc = cpu_to_le64(first_blkno); - ret = ocfs2_journal_dirty(handle, xs->inode_bh); - if (ret < 0) - mlog_errno(ret); -out_commit: - ocfs2_commit_trans(osb, handle); -out: - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - if (ret < 0) - return ret; + ocfs2_journal_dirty(handle, xs->inode_bh); } else xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { /* Set extended attribute into external block */ - ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, + OCFS2_HAS_XATTR_FL); if (!ret || ret != -ENOSPC) goto end; - ret = ocfs2_xattr_create_index_block(inode, xs); + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); if (ret) goto end; } - ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); end: return ret; } +/* Check whether the new xattr can be inserted into the inode. */ +static int ocfs2_xattr_can_be_in_inode(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + u64 value_size; + struct ocfs2_xattr_entry *last; + int free, i; + size_t min_offs = xs->end - xs->base; + + if (!xs->header) + return 0; + + last = xs->header->xh_entries; + + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return 0; + + BUG_ON(!xs->not_found); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (free >= sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size) + return 1; + + return 0; +} + +static int ocfs2_calc_xattr_set_need(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + int *clusters_need, + int *meta_need, + int *credits_need) +{ + int ret = 0, old_in_xb = 0; + int clusters_add = 0, meta_add = 0, credits = 0; + struct buffer_head *bh = NULL; + struct ocfs2_xattr_block *xb = NULL; + struct ocfs2_xattr_entry *xe = NULL; + struct ocfs2_xattr_value_root *xv = NULL; + char *base = NULL; + int name_offset, name_len = 0; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + xi->value_len); + u64 value_size; + + /* + * Calculate the clusters we need to write. + * No matter whether we replace an old one or add a new one, + * we need this for writing. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + credits += new_clusters * + ocfs2_clusters_to_blocks(inode->i_sb, 1); + + if (xis->not_found && xbs->not_found) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + clusters_add += new_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + } + + goto meta_guess; + } + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + credits += OCFS2_INODE_UPDATE_CREDITS; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + old_in_xb = 1; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + base = bucket_block(xbs->bucket, block_off); + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } else { + base = xbs->base; + credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; + } + } + + /* + * delete a xattr doesn't need metadata and cluster allocation. + * so just calculate the credits and return. + * + * The credits for removing the value tree will be extended + * by ocfs2_remove_extent itself. + */ + if (!xi->value) { + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_remove_extent_credits(inode->i_sb); + + goto out; + } + + /* do cluster allocation guess first. */ + value_size = le64_to_cpu(xe->xe_value_size); + + if (old_in_xb) { + /* + * In xattr set, we always try to set the xe in inode first, + * so if it can be inserted into inode successfully, the old + * one will be removed from the xattr block, and this xattr + * will be inserted into inode as a new xattr in inode. + */ + if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { + clusters_add += new_clusters; + credits += ocfs2_remove_extent_credits(inode->i_sb) + + OCFS2_INODE_UPDATE_CREDITS; + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_calc_extend_credits( + inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + goto out; + } + } + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* the new values will be stored outside. */ + u32 old_clusters = 0; + + if (!ocfs2_xattr_is_local(xe)) { + old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + value_size); + xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + value_size = OCFS2_XATTR_ROOT_SIZE; + } else + xv = &def_xv.xv; + + if (old_clusters >= new_clusters) { + credits += ocfs2_remove_extent_credits(inode->i_sb); + goto out; + } else { + meta_add += ocfs2_extend_meta_needed(&xv->xr_list); + clusters_add += new_clusters - old_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &xv->xr_list, + new_clusters - + old_clusters); + if (value_size >= OCFS2_XATTR_ROOT_SIZE) + goto out; + } + } else { + /* + * Now the new value will be stored inside. So if the new + * value is smaller than the size of value root or the old + * value, we don't need any allocation, otherwise we have + * to guess metadata allocation. + */ + if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || + (!ocfs2_xattr_is_local(xe) && + OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) + goto out; + } + +meta_guess: + /* calculate metadata allocation. */ + if (di->i_xattr_loc) { + if (!xbs->xattr_bh) { + ret = ocfs2_read_xattr_block(inode, + le64_to_cpu(di->i_xattr_loc), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)bh->b_data; + } else + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + + /* + * If there is already an xattr tree, good, we can calculate + * like other b-trees. Otherwise we may have the chance of + * create a tree, the credit calculation is borrowed from + * ocfs2_calc_extend_credits with root_el = NULL. And the + * new tree will be cluster based, so no meta is needed. + */ + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + struct ocfs2_extent_list *el = + &xb->xb_attrs.xb_root.xt_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el, 1); + } else + credits += OCFS2_SUBALLOC_ALLOC + 1; + + /* + * This cluster will be used either for new bucket or for + * new xattr block. + * If the cluster size is the same as the bucket size, one + * more is needed since we may need to extend the bucket + * also. + */ + clusters_add += 1; + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + if (OCFS2_XATTR_BUCKET_SIZE == + OCFS2_SB(inode->i_sb)->s_clustersize) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + clusters_add += 1; + } + } else { + meta_add += 1; + credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } +out: + if (clusters_need) + *clusters_need = clusters_add; + if (meta_need) + *meta_need = meta_add; + if (credits_need) + *credits_need = credits; + brelse(bh); + return ret; +} + +static int ocfs2_init_xattr_set_ctxt(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt, + int *credits) +{ + int clusters_add, meta_add, ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); + + ocfs2_init_dealloc_ctxt(&ctxt->dealloc); + + ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, + &clusters_add, &meta_add, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " + "credits = %d\n", xi->name, meta_add, clusters_add, *credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &ctxt->meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (clusters_add) { + ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (ctxt->meta_ac) { + ocfs2_free_alloc_context(ctxt->meta_ac); + ctxt->meta_ac = NULL; + } + + /* + * We cannot have an error and a non null ctxt->data_ac. + */ + } + + return ret; +} + +static int __ocfs2_xattr_set_handle(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0, credits, old_found; + + if (!xi->value) { + /* Remove existing extended attribute */ + if (!xis->not_found) + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + else if (!xbs->not_found) + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + if (!ret && !xbs->not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi->value = NULL; + xi->value_len = 0; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs->xattr_bh) { + ret = ocfs2_xattr_block_find(inode, + xi->name_index, + xi->name, xbs); + if (ret) + goto out; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + if (ret) + goto out; + if (!xis->not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi->value = NULL; + xi->value_len = 0; + xbs->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_ibody_set(inode, xi, + xis, ctxt); + } + } + } + + if (!ret) { + /* Update inode ctime. */ + ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); + } +out: + return ret; +} + +/* + * This function only called duing creating inode + * for init security/acl xattrs of the new inode. + * All transanction credits have been reserved in mknod. + */ +int ocfs2_xattr_set_handle(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_dinode *di; + int ret; + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_set_ctxt ctxt = { + .handle = handle, + .meta_ac = meta_ac, + .data_ac = data_ac, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * In extreme situation, may need xattr bucket when + * block size is too small. And we have already reserved + * the credits for bucket in mknod. + */ + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + } + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + /* * ocfs2_xattr_set() * @@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret; - u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; struct ocfs2_xattr_info xi = { .name_index = name_index, @@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode, if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; + /* + * Only xbs will be used on indexed trees. xis doesn't need a + * bucket. + */ + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); - return ret; + goto cleanup_nolock; } xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode, goto cleanup; } - if (!value) { - /* Remove existing extended attribute */ - if (!xis.not_found) - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - else if (!xbs.not_found) - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - } else { - /* We always try to set extended attribute into inode first*/ - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - if (!ret && !xbs.not_found) { - /* - * If succeed and that extended attribute existing in - * external block, then we will remove it. - */ - xi.value = NULL; - xi.value_len = 0; - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - } else if (ret == -ENOSPC) { - if (di->i_xattr_loc && !xbs.xattr_bh) { - ret = ocfs2_xattr_block_find(inode, name_index, - name, &xbs); - if (ret) - goto cleanup; - } - /* - * If no space in inode, we will set extended attribute - * into external block. - */ - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - if (ret) - goto cleanup; - if (!xis.not_found) { - /* - * If succeed and that extended attribute - * existing in inode, we will remove it. - */ - xi.value = NULL; - xi.value_len = 0; - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mutex_unlock(&tl_inode->i_mutex); + mlog_errno(ret); + goto cleanup; } } + mutex_unlock(&tl_inode->i_mutex); + + ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, + &xbs, &ctxt, &credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + /* we need to update inode's ctime field, so add credit for it. */ + credits += OCFS2_INODE_UPDATE_CREDITS; + ctxt.handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + + if (ctxt.data_ac) + ocfs2_free_alloc_context(ctxt.data_ac); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); cleanup: up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); +cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); - for (i = 0; i < blk_per_bucket; i++) - brelse(xbs.bucket.bhs[i]); + ocfs2_xattr_bucket_free(xbs.bucket); return ret; } @@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode, void *para); static int ocfs2_find_xe_in_bucket(struct inode *inode, - struct buffer_head *header_bh, + struct ocfs2_xattr_bucket *bucket, int name_index, const char *name, u32 name_hash, @@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, int *found) { int i, ret = 0, cmp = 1, block_off, new_offset; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); size_t name_len = strlen(name); struct ocfs2_xattr_entry *xe = NULL; - struct buffer_head *name_bh = NULL; char *xe_name; /* @@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, break; } - ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, - &name_bh); - if (ret) { - mlog_errno(ret); - break; - } - xe_name = name_bh->b_data + new_offset; - cmp = memcmp(name, xe_name, name_len); - brelse(name_bh); - name_bh = NULL; - - if (cmp == 0) { + xe_name = bucket_block(bucket, block_off) + new_offset; + if (!memcmp(name, xe_name, name_len)) { *xe_index = i; *found = 1; ret = 0; @@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, struct ocfs2_xattr_search *xs) { int ret, found = 0; - struct buffer_head *bh = NULL; - struct buffer_head *lower_bh = NULL; struct ocfs2_xattr_header *xh = NULL; struct ocfs2_xattr_entry *xe = NULL; u16 index = 0; u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int low_bucket = 0, bucket, high_bucket; + struct ocfs2_xattr_bucket *search; u32 last_hash; - u64 blkno; + u64 blkno, lower_blkno = 0; - ret = ocfs2_read_block(inode, p_blkno, &bh); + search = ocfs2_xattr_bucket_new(inode); + if (!search) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(search, p_blkno); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; + xh = bucket_xh(search); high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; - while (low_bucket <= high_bucket) { - brelse(bh); - bh = NULL; - bucket = (low_bucket + high_bucket) / 2; + ocfs2_xattr_bucket_relse(search); + bucket = (low_bucket + high_bucket) / 2; blkno = p_blkno + bucket * blk_per_bucket; - - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_xattr_bucket(search, blkno); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; + xh = bucket_xh(search); xe = &xh->xh_entries[0]; if (name_hash < le32_to_cpu(xe->xe_name_hash)) { high_bucket = bucket - 1; @@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, last_hash = le32_to_cpu(xe->xe_name_hash); - /* record lower_bh which may be the insert place. */ - brelse(lower_bh); - lower_bh = bh; - bh = NULL; + /* record lower_blkno which may be the insert place. */ + lower_blkno = blkno; if (name_hash > le32_to_cpu(xe->xe_name_hash)) { low_bucket = bucket + 1; @@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, } /* the searched xattr should reside in this bucket if exists. */ - ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + ret = ocfs2_find_xe_in_bucket(inode, search, name_index, name, name_hash, &index, &found); if (ret) { @@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, * When the xattr's hash value is in the gap of 2 buckets, we will * always set it to the previous bucket. */ - if (!lower_bh) { - /* - * We can't find any bucket whose first name_hash is less - * than the find name_hash. - */ - BUG_ON(bh->b_blocknr != p_blkno); - lower_bh = bh; - bh = NULL; + if (!lower_blkno) + lower_blkno = p_blkno; + + /* This should be in cache - we just read it during the search */ + ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); + if (ret) { + mlog_errno(ret); + goto out; } - xs->bucket.bhs[0] = lower_bh; - xs->bucket.xh = (struct ocfs2_xattr_header *) - xs->bucket.bhs[0]->b_data; - lower_bh = NULL; - xs->header = xs->bucket.xh; - xs->base = xs->bucket.bhs[0]->b_data; + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); xs->end = xs->base + inode->i_sb->s_blocksize; if (found) { - /* - * If we have found the xattr enty, read all the blocks in - * this bucket. - */ - ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); - if (ret) { - mlog_errno(ret); - goto out; - } - xs->here = &xs->header->xh_entries[index]; mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + (unsigned long long)bucket_blkno(xs->bucket), index); } else ret = -ENODATA; out: - brelse(bh); - brelse(lower_bh); + ocfs2_xattr_bucket_free(search); return ret; } @@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, xattr_bucket_func *func, void *para) { - int i, j, ret = 0; - int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int i, ret = 0; u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); u32 num_buckets = clusters * bpc; - struct ocfs2_xattr_bucket bucket; + struct ocfs2_xattr_bucket *bucket; - memset(&bucket, 0, sizeof(bucket)); + bucket = ocfs2_xattr_bucket_new(inode); + if (!bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", clusters, (unsigned long long)blkno); - for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { - ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, - bucket.bhs, 0); + for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { + ret = ocfs2_read_xattr_bucket(bucket, blkno); if (ret) { mlog_errno(ret); - goto out; + break; } - bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; /* * The real bucket num in this series of blocks is stored * in the 1st bucket. */ if (i == 0) - num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); mlog(0, "iterating xattr bucket %llu, first hash %u\n", (unsigned long long)blkno, - le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); if (func) { - ret = func(inode, &bucket, para); - if (ret) { + ret = func(inode, bucket, para); + if (ret) mlog_errno(ret); - break; - } + /* Fall through to bucket_relse() */ } - for (j = 0; j < blk_per_bucket; j++) - brelse(bucket.bhs[j]); - memset(&bucket, 0, sizeof(bucket)); + ocfs2_xattr_bucket_relse(bucket); + if (ret) + break; } -out: - for (j = 0; j < blk_per_bucket; j++) - brelse(bucket.bhs[j]); - + ocfs2_xattr_bucket_free(bucket); return ret; } @@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int i, block_off, new_offset; const char *prefix, *name; - for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { - struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); prefix = ocfs2_xattr_prefix(type); if (prefix) { ret = ocfs2_xattr_bucket_get_name_value(inode, - bucket->xh, + bucket_xh(bucket), i, &block_off, &new_offset); if (ret) break; - name = (const char *)bucket->bhs[block_off]->b_data + + name = (const char *)bucket_block(bucket, block_off) + new_offset; ret = ocfs2_xattr_list_entry(xl->buffer, xl->buffer_size, @@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size) /* * When the ocfs2_xattr_block is filled up, new bucket will be created * and all the xattr entries will be moved to the new bucket. + * The header goes at the start of the bucket, and the names+values are + * filled from the end. This is why *target starts as the last buffer. * Note: we need to sort the entries since they are not saved in order * in the ocfs2_xattr_block. */ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, struct buffer_head *xb_bh, - struct buffer_head *xh_bh, - struct buffer_head *data_bh) + struct ocfs2_xattr_bucket *bucket) { int i, blocksize = inode->i_sb->s_blocksize; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); u16 offset, size, off_change; struct ocfs2_xattr_entry *xe; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)xh_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 count = le16_to_cpu(xb_xh->xh_count); - char *target = xh_bh->b_data, *src = xb_bh->b_data; + char *src = xb_bh->b_data; + char *target = bucket_block(bucket, blks - 1); mlog(0, "cp xattr from block %llu to bucket %llu\n", (unsigned long long)xb_bh->b_blocknr, - (unsigned long long)xh_bh->b_blocknr); + (unsigned long long)bucket_blkno(bucket)); + + for (i = 0; i < blks; i++) + memset(bucket_block(bucket, i), 0, blocksize); - memset(xh_bh->b_data, 0, blocksize); - if (data_bh) - memset(data_bh->b_data, 0, blocksize); /* * Since the xe_name_offset is based on ocfs2_xattr_header, * there is a offset change corresponding to the change of @@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, size = blocksize - offset; /* copy all the names and values. */ - if (data_bh) - target = data_bh->b_data; memcpy(target + offset, src + offset, size); /* Init new header now. */ @@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); /* copy all the entries. */ - target = xh_bh->b_data; + target = bucket_block(bucket, 0); offset = offsetof(struct ocfs2_xattr_header, xh_entries); size = count * sizeof(struct ocfs2_xattr_entry); memcpy(target + offset, (char *)xb_xh + offset, size); @@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, * While if the entry is in index b-tree, "bucket" indicates the * real place of the xattr. */ -static int ocfs2_xattr_update_xattr_search(struct inode *inode, - struct ocfs2_xattr_search *xs, - struct buffer_head *old_bh, - struct buffer_head *new_bh) +static void ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh) { - int ret = 0; char *buf = old_bh->b_data; struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; - int i, blocksize = inode->i_sb->s_blocksize; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - - xs->bucket.bhs[0] = new_bh; - get_bh(new_bh); - xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; - xs->header = xs->bucket.xh; + int i; - xs->base = new_bh->b_data; + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); xs->end = xs->base + inode->i_sb->s_blocksize; - if (!xs->not_found) { - if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { - ret = ocfs2_read_blocks(inode, - xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); - if (ret) { - mlog_errno(ret); - return ret; - } - - } - i = xs->here - old_xh->xh_entries; - xs->here = &xs->header->xh_entries[i]; - } + if (xs->not_found) + return; - return ret; + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; } static int ocfs2_xattr_create_index_block(struct inode *inode, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { - int ret, credits = OCFS2_SUBALLOC_ALLOC; + int ret; u32 bit_off, len; u64 blkno; - handle_t *handle; + handle_t *handle = ctxt->handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_alloc_context *data_ac; - struct buffer_head *xh_bh = NULL, *data_bh = NULL; struct buffer_head *xb_bh = xs->xattr_bh; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_tree_root *xr; u16 xb_flags = le16_to_cpu(xb->xb_flags); - u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); mlog(0, "create xattr index block for %llu\n", (unsigned long long)xb_bh->b_blocknr); BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); - - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } + BUG_ON(!xs->bucket); /* * XXX: @@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ down_write(&oi->ip_alloc_sem); - /* - * 3 more credits, one for xattr block update, one for the 1st block - * of the new xattr bucket and one for the value/data. - */ - credits += 3; - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_sem; - } - - ret = ocfs2_journal_access(handle, inode, xb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, + 1, 1, &bit_off, &len); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } /* @@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, mlog(0, "allocate 1 cluster from %llu to xattr block\n", (unsigned long long)blkno); - xh_bh = sb_getblk(inode->i_sb, blkno); - if (!xh_bh) { - ret = -EIO; + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ocfs2_set_new_buffer_uptodate(inode, xh_bh); - - ret = ocfs2_journal_access(handle, inode, xh_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); - goto out_commit; - } - - if (bpb > 1) { - data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); - if (!data_bh) { - ret = -EIO; - mlog_errno(ret); - goto out_commit; - } - - ocfs2_set_new_buffer_uptodate(inode, data_bh); - - ret = ocfs2_journal_access(handle, inode, data_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + goto out; } - ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); - ocfs2_journal_dirty(handle, xh_bh); - if (data_bh) - ocfs2_journal_dirty(handle, data_bh); - - ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - @@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); - ret = ocfs2_journal_dirty(handle, xb_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_sem: - up_write(&oi->ip_alloc_sem); + ocfs2_journal_dirty(handle, xb_bh); out: - if (data_ac) - ocfs2_free_alloc_context(data_ac); - - brelse(xh_bh); - brelse(data_bh); + up_write(&oi->ip_alloc_sem); return ret; } @@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b) * so that we can spare some space for insertion. */ static int ocfs2_defrag_xattr_bucket(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_bucket *bucket) { int ret, i; size_t end, offset, len, value_len; struct ocfs2_xattr_header *xh; char *entries, *buf, *bucket_buf = NULL; - u64 blkno = bucket->bhs[0]->b_blocknr; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 blkno = bucket_blkno(bucket); u16 xh_free_start; size_t blocksize = inode->i_sb->s_blocksize; - handle_t *handle; - struct buffer_head **bhs; struct ocfs2_xattr_entry *xe; - bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!bhs) - return -ENOMEM; - - ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0); - if (ret) - goto out; - /* * In order to make the operation more efficient and generic, * we copy all the blocks into a contiguous memory and do the @@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, } buf = bucket_buf; - for (i = 0; i < blk_per_bucket; i++, buf += blocksize) - memcpy(buf, bhs[i]->b_data, blocksize); + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(buf, bucket_block(bucket, i), blocksize); - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto commit; - } - } - xh = (struct ocfs2_xattr_header *)bucket_buf; entries = (char *)xh->xh_entries; xh_free_start = le16_to_cpu(xh->xh_free_start); @@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, "bucket %llu\n", (unsigned long long)blkno); if (xh_free_start == end) - goto commit; + goto out; memset(bucket_buf + xh_free_start, 0, end - xh_free_start); xh->xh_free_start = cpu_to_le16(end); @@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, cmp_xe, swap_xe); buf = bucket_buf; - for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { - memcpy(bhs[i]->b_data, buf, blocksize); - ocfs2_journal_dirty(handle, bhs[i]); - } + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(bucket_block(bucket, i), buf, blocksize); + ocfs2_xattr_bucket_journal_dirty(handle, bucket); -commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: - - if (bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(bhs[i]); - } - kfree(bhs); - kfree(bucket_buf); return ret; } /* - * Move half nums of the xattr bucket in the previous cluster to this new - * cluster. We only touch the last cluster of the previous extend record. + * prev_blkno points to the start of an existing extent. new_blkno + * points to a newly allocated extent. Because we know each of our + * clusters contains more than bucket, we can easily split one cluster + * at a bucket boundary. So we take the last cluster of the existing + * extent and split it down the middle. We move the last half of the + * buckets in the last cluster of the existing extent over to the new + * extent. + * + * first_bh is the buffer at prev_blkno so we can update the existing + * extent's bucket count. header_bh is the bucket were we were hoping + * to insert our xattr. If the bucket move places the target in the new + * extent, we'll update first_bh and header_bh after modifying the old + * extent. * - * first_bh is the first buffer_head of a series of bucket in the same - * extent rec and header_bh is the header of one bucket in this cluster. - * They will be updated if we move the data header_bh contains to the new - * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + * first_hash will be set as the 1st xe's name_hash in the new extent. */ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, handle_t *handle, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u64 new_blkno, - u64 prev_blkno, u32 num_clusters, u32 *first_hash) { - int i, ret, credits; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); - int blocksize = inode->i_sb->s_blocksize; - struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; - struct ocfs2_xattr_header *new_xh; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)((*first_bh)->b_data); - - BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); - BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); - - prev_bh = *first_bh; - get_bh(prev_bh); - xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + int ret; + struct super_block *sb = inode->i_sb; + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + int to_move = num_buckets / 2; + u64 src_blkno; + u64 last_cluster_blkno = bucket_blkno(first) + + ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); - prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); mlog(0, "move half of xattrs in cluster %llu to %llu\n", - (unsigned long long)prev_blkno, (unsigned long long)new_blkno); + (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); - /* - * We need to update the 1st half of the new cluster and - * 1 more for the update of the 1st bucket of the previous - * extent record. - */ - credits = bpc / 2 + 1; - ret = ocfs2_extend_trans(handle, credits); + ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), + last_cluster_blkno, new_blkno, + to_move, first_hash); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, prev_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } + /* This is the first bucket that got moved */ + src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); - for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { - old_bh = new_bh = NULL; - new_bh = sb_getblk(inode->i_sb, new_blkno); - if (!new_bh) { - ret = -EIO; - mlog_errno(ret); - goto out; - } + /* + * If the target bucket was part of the moved buckets, we need to + * update first and target. + */ + if (bucket_blkno(target) >= src_blkno) { + /* Find the block for the new target bucket */ + src_blkno = new_blkno + + (bucket_blkno(target) - src_blkno); - ocfs2_set_new_buffer_uptodate(inode, new_bh); + ocfs2_xattr_bucket_relse(first); + ocfs2_xattr_bucket_relse(target); - ret = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { + /* + * These shouldn't fail - the buffers are in the + * journal from ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_read_xattr_bucket(first, new_blkno); + if (ret) { mlog_errno(ret); - brelse(new_bh); goto out; } - - ret = ocfs2_read_block(inode, prev_blkno, &old_bh); - if (ret < 0) { + ret = ocfs2_read_xattr_bucket(target, src_blkno); + if (ret) mlog_errno(ret); - brelse(new_bh); - goto out; - } - memcpy(new_bh->b_data, old_bh->b_data, blocksize); - - if (i == 0) { - new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; - new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); - - if (first_hash) - *first_hash = le32_to_cpu( - new_xh->xh_entries[0].xe_name_hash); - new_first_bh = new_bh; - get_bh(new_first_bh); - } - - ocfs2_journal_dirty(handle, new_bh); - - if (*header_bh == old_bh) { - brelse(*header_bh); - *header_bh = new_bh; - get_bh(*header_bh); - - brelse(*first_bh); - *first_bh = new_first_bh; - get_bh(*first_bh); - } - brelse(new_bh); - brelse(old_bh); } - le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); - - ocfs2_journal_dirty(handle, prev_bh); out: - brelse(prev_bh); - brelse(new_first_bh); - return ret; -} - -static int ocfs2_read_xattr_bucket(struct inode *inode, - u64 blkno, - struct buffer_head **bhs, - int new) -{ - int ret = 0; - u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - - if (!new) - return ocfs2_read_blocks(inode, blkno, - blk_per_bucket, bhs, 0); - - for (i = 0; i < blk_per_bucket; i++) { - bhs[i] = sb_getblk(inode->i_sb, blkno + i); - if (bhs[i] == NULL) { - ret = -EIO; - mlog_errno(ret); - break; - } - ocfs2_set_new_buffer_uptodate(inode, bhs[i]); - } - return ret; } @@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, { int ret, i; int count, start, len, name_value_len = 0, xe_len, name_offset = 0; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; int blocksize = inode->i_sb->s_blocksize; @@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, mlog(0, "move some of xattrs from bucket %llu to %llu\n", (unsigned long long)blk, (unsigned long long)new_blk); - s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); - if (!s_bhs) - return -ENOMEM; - - ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); - if (ret) { + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, s_bhs[0], - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_read_xattr_bucket(s_bucket, blk); if (ret) { mlog_errno(ret); goto out; } - t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); - if (!t_bhs) { - ret = -ENOMEM; + ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); goto out; } - ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + /* + * Even if !new_bucket_head, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); if (ret) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, t_bhs[i], - new_bucket_head ? - OCFS2_JOURNAL_ACCESS_CREATE : - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the + * same part of ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; } - xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + xh = bucket_xh(s_bucket); count = le16_to_cpu(xh->xh_count); start = ocfs2_xattr_find_divide_pos(xh); @@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * The hash value is set as one larger than * that of the last entry in the previous bucket. */ - for (i = 0; i < blk_per_bucket; i++) - memset(t_bhs[i]->b_data, 0, blocksize); + for (i = 0; i < t_bucket->bu_blocks; i++) + memset(bucket_block(t_bucket, i), 0, blocksize); - xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + xh = bucket_xh(t_bucket); xh->xh_free_start = cpu_to_le16(blocksize); xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); @@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, } /* copy the whole bucket to the new first. */ - for (i = 0; i < blk_per_bucket; i++) - memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); /* update the new bucket. */ - xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + xh = bucket_xh(t_bucket); /* * Calculate the total name/value len and xh_free_start for @@ -3319,11 +3902,7 @@ set_num_buckets: else xh->xh_num_buckets = 0; - for (i = 0; i < blk_per_bucket; i++) { - ocfs2_journal_dirty(handle, t_bhs[i]); - if (ret) - mlog_errno(ret); - } + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); /* store the first_hash of the new bucket. */ if (first_hash) @@ -3337,29 +3916,18 @@ set_num_buckets: if (start == count) goto out; - xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + xh = bucket_xh(s_bucket); memset(&xh->xh_entries[start], 0, sizeof(struct ocfs2_xattr_entry) * (count - start)); xh->xh_count = cpu_to_le16(start); xh->xh_free_start = cpu_to_le16(name_offset); xh->xh_name_value_len = cpu_to_le16(name_value_len); - ocfs2_journal_dirty(handle, s_bhs[0]); - if (ret) - mlog_errno(ret); + ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); out: - if (s_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(s_bhs[i]); - } - kfree(s_bhs); - - if (t_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(t_bhs[i]); - } - kfree(t_bhs); + ocfs2_xattr_bucket_free(s_bucket); + ocfs2_xattr_bucket_free(t_bucket); return ret; } @@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, u64 t_blkno, int t_is_new) { - int ret, i; - int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int blocksize = inode->i_sb->s_blocksize; - struct buffer_head **s_bhs, **t_bhs = NULL; + int ret; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; BUG_ON(s_blkno == t_blkno); @@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, (unsigned long long)s_blkno, (unsigned long long)t_blkno, t_is_new); - s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!s_bhs) - return -ENOMEM; + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } - ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); if (ret) goto out; - t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!t_bhs) { - ret = -ENOMEM; + /* + * Even if !t_is_new, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + if (ret) goto out; - } - ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new + * cluster to fill, we came here from + * ocfs2_mv_xattr_buckets(), and it is really new - + * ACCESS_CREATE is required. But we also might have moved data + * out of t_bucket before extending back into it. + * ocfs2_add_new_xattr_bucket() can do this - its call to + * ocfs2_add_new_xattr_cluster() may have created a new extent + * and copied out the end of the old extent. Then it re-extends + * the old extent back to create space for new xattrs. That's + * how we get here, and the bucket isn't really new. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) goto out; - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, t_bhs[i], - t_is_new ? - OCFS2_JOURNAL_ACCESS_CREATE : - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) - goto out; - } - - for (i = 0; i < blk_per_bucket; i++) { - memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); - ocfs2_journal_dirty(handle, t_bhs[i]); - } + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); out: - if (s_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(s_bhs[i]); - } - kfree(s_bhs); - - if (t_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(t_bhs[i]); - } - kfree(t_bhs); + ocfs2_xattr_bucket_free(t_bucket); + ocfs2_xattr_bucket_free(s_bucket); return ret; } /* - * Copy one xattr cluster from src_blk to to_blk. - * The to_blk will become the first bucket header of the cluster, so its - * xh_num_buckets will be initialized as the bucket num in the cluster. + * src_blk points to the start of an existing extent. last_blk points to + * last cluster in that extent. to_blk points to a newly allocated + * extent. We copy the buckets from the cluster at last_blk to the new + * extent. If start_bucket is non-zero, we skip that many buckets before + * we start copying. The new extent's xh_num_buckets gets set to the + * number of buckets we copied. The old extent's xh_num_buckets shrinks + * by the same amount. */ -static int ocfs2_cp_xattr_cluster(struct inode *inode, - handle_t *handle, - struct buffer_head *first_bh, - u64 src_blk, - u64 to_blk, +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, u32 *first_hash) { int i, ret, credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); - struct buffer_head *bh = NULL; - struct ocfs2_xattr_header *xh; - u64 to_blk_start = to_blk; + struct ocfs2_xattr_bucket *old_first, *new_first; + + mlog(0, "mv xattrs from cluster %llu to %llu\n", + (unsigned long long)last_blk, (unsigned long long)to_blk); + + BUG_ON(start_bucket >= num_buckets); + if (start_bucket) { + num_buckets -= start_bucket; + last_blk += (start_bucket * blks_per_bucket); + } + + /* The first bucket of the original extent */ + old_first = ocfs2_xattr_bucket_new(inode); + /* The first bucket of the new extent */ + new_first = ocfs2_xattr_bucket_new(inode); + if (!old_first || !new_first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } - mlog(0, "cp xattrs from cluster %llu to %llu\n", - (unsigned long long)src_blk, (unsigned long long)to_blk); + ret = ocfs2_read_xattr_bucket(old_first, src_blk); + if (ret) { + mlog_errno(ret); + goto out; + } /* - * We need to update the new cluster and 1 more for the update of - * the 1st bucket of the previous extent rec. + * We need to update the first bucket of the old extent and all + * the buckets going to the new extent. */ - credits = bpc + 1; + credits = ((num_buckets + 1) * blks_per_bucket) + + handle->h_buffer_credits; ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, first_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, old_first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode, for (i = 0; i < num_buckets; i++) { ret = ocfs2_cp_xattr_bucket(inode, handle, - src_blk, to_blk, 1); + last_blk + (i * blks_per_bucket), + to_blk + (i * blks_per_bucket), + 1); if (ret) { mlog_errno(ret); goto out; } - - src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); - to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); } - /* update the old bucket header. */ - xh = (struct ocfs2_xattr_header *)first_bh->b_data; - le16_add_cpu(&xh->xh_num_buckets, -num_buckets); - - ocfs2_journal_dirty(handle, first_bh); - - /* update the new bucket header. */ - ret = ocfs2_read_block(inode, to_blk_start, &bh); - if (ret < 0) { + /* + * Get the new bucket ready before we dirty anything + * (This actually shouldn't fail, because we already dirtied + * it once in ocfs2_cp_xattr_bucket()). + */ + ret = ocfs2_read_xattr_bucket(new_first, to_blk); + if (ret) { mlog_errno(ret); goto out; } - - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, new_first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; - xh->xh_num_buckets = cpu_to_le16(num_buckets); + /* Now update the headers */ + le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, old_first); - ocfs2_journal_dirty(handle, bh); + bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, new_first); if (first_hash) - *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); + out: - brelse(bh); + ocfs2_xattr_bucket_free(new_first); + ocfs2_xattr_bucket_free(old_first); return ret; } @@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, u32 *first_hash) { u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int ret, credits = 2 * blk_per_bucket; + int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); @@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, */ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, handle_t *handle, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u64 new_blk, - u64 prev_blk, u32 prev_clusters, u32 *v_start, int *extend) { - int ret = 0; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int ret; mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", - (unsigned long long)prev_blk, prev_clusters, + (unsigned long long)bucket_blkno(first), prev_clusters, (unsigned long long)new_blk); - if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, handle, - first_bh, - header_bh, + first, target, new_blk, - prev_blk, prev_clusters, v_start); - else { - u64 last_blk = prev_blk + bpc * (prev_clusters - 1); - - if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) - ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, - last_blk, new_blk, + if (ret) + mlog_errno(ret); + } else { + /* The start of the last cluster in the first extent */ + u64 last_blk = bucket_blkno(first) + + ((prev_clusters - 1) * + ocfs2_clusters_to_blocks(inode->i_sb, 1)); + + if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { + ret = ocfs2_mv_xattr_buckets(inode, handle, + bucket_blkno(first), + last_blk, new_blk, 0, v_start); - else { + if (ret) + mlog_errno(ret); + } else { ret = ocfs2_divide_xattr_cluster(inode, handle, last_blk, new_blk, v_start); + if (ret) + mlog_errno(ret); - if ((*header_bh)->b_blocknr == last_blk && extend) + if ((bucket_blkno(target) == last_blk) && extend) *extend = 0; } } @@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, */ static int ocfs2_add_new_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u32 *num_clusters, u32 prev_cpos, - u64 prev_blkno, - int *extend) + int *extend, + struct ocfs2_xattr_set_ctxt *ctxt) { - int ret, credits; + int ret; u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); u32 prev_clusters = *num_clusters; u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; u64 block; - handle_t *handle = NULL; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " "previous xattr blkno = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, - prev_cpos, (unsigned long long)prev_blkno); + prev_cpos, (unsigned long long)bucket_blkno(first)); ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); - ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, - &data_ac, &meta_ac); - if (ret) { - mlog_errno(ret); - goto leave; - } - - credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, - clusters_to_add); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - mlog_errno(ret); - goto leave; - } - - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto leave; } - ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (ret < 0) { if (ret != -ENOSPC) @@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - if (prev_blkno + prev_clusters * bpc == block && + if (bucket_blkno(first) + (prev_clusters * bpc) == block && (prev_clusters + num_bits) << osb->s_clustersize_bits <= OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { /* @@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } else { ret = ocfs2_adjust_xattr_cross_cluster(inode, handle, - first_bh, - header_bh, + first, + target, block, - prev_blkno, prev_clusters, &v_start, extend); @@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } } - if (handle->h_buffer_credits < credits) { - /* - * The journal has been restarted before, and don't - * have enough space for the insertion, so extend it - * here. - */ - ret = ocfs2_extend_trans(handle, credits); - if (ret) { - mlog_errno(ret); - goto leave; - } - } mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", num_bits, (unsigned long long)block, v_start); ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, - num_bits, 0, meta_ac); + num_bits, 0, ctxt->meta_ac); if (ret < 0) { mlog_errno(ret); goto leave; } ret = ocfs2_journal_dirty(handle, root_bh); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto leave; - } leave: - if (handle) - ocfs2_commit_trans(osb, handle); - if (data_ac) - ocfs2_free_alloc_context(data_ac); - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - return ret; } /* - * Extend a new xattr bucket and move xattrs to the end one by one until - * We meet with start_bh. Only move half of the xattrs to the bucket after it. + * We are given an extent. 'first' is the bucket at the very front of + * the extent. The extent has space for an additional bucket past + * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number + * of the target bucket. We wish to shift every bucket past the target + * down one, filling in that additional space. When we get back to the + * target, we split the target between itself and the now-empty bucket + * at target+1 (aka, target_blkno + blks_per_bucket). */ static int ocfs2_extend_xattr_bucket(struct inode *inode, - struct buffer_head *first_bh, - struct buffer_head *start_bh, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + u64 target_blk, u32 num_clusters) { int ret, credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - u64 start_blk = start_bh->b_blocknr, end_blk; - u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); - handle_t *handle; - struct ocfs2_xattr_header *first_xh = - (struct ocfs2_xattr_header *)first_bh->b_data; - u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + u64 end_blk; + u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " - "from %llu, len = %u\n", (unsigned long long)start_blk, - (unsigned long long)first_bh->b_blocknr, num_clusters); + "from %llu, len = %u\n", (unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), num_clusters); - BUG_ON(bucket >= num_buckets); + /* The extent must have room for an additional bucket */ + BUG_ON(new_bucket >= + (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); - end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + /* end_blk points to the last existing bucket */ + end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); /* - * We will touch all the buckets after the start_bh(include it). - * Add one more bucket and modify the first_bh. + * end_blk is the start of the last existing bucket. + * Thus, (end_blk - target_blk) covers the target bucket and + * every bucket after it up to, but not including, the last + * existing bucket. Then we add the last existing bucket, the + * new bucket, and the first bucket (3 * blk_per_bucket). */ - credits = end_blk - start_blk + 2 * blk_per_bucket + 1; - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + credits = (end_blk - target_blk) + (3 * blk_per_bucket) + + handle->h_buffer_credits; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, first_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto commit; + goto out; } - while (end_blk != start_blk) { + while (end_blk != target_blk) { ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, end_blk + blk_per_bucket, 0); if (ret) - goto commit; + goto out; end_blk -= blk_per_bucket; } - /* Move half of the xattr in start_blk to the next bucket. */ - ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, - start_blk + blk_per_bucket, NULL, 0); + /* Move half of the xattr in target_blkno to the next bucket. */ + ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, + target_blk + blk_per_bucket, NULL, 0); - le16_add_cpu(&first_xh->xh_num_buckets, 1); - ocfs2_journal_dirty(handle, first_bh); + le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); + ocfs2_xattr_bucket_journal_dirty(handle, first); -commit: - ocfs2_commit_trans(osb, handle); out: return ret; } /* - * Add new xattr bucket in an extent record and adjust the buckets accordingly. - * xb_bh is the ocfs2_xattr_block. - * We will move all the buckets starting from header_bh to the next place. As - * for this one, half num of its xattrs will be moved to the next one. + * Add new xattr bucket in an extent record and adjust the buckets + * accordingly. xb_bh is the ocfs2_xattr_block, and target is the + * bucket we want to insert into. + * + * In the easy case, we will move all the buckets after target down by + * one. Half of target's xattrs will be moved to the next bucket. * - * We will allocate a new cluster if current cluster is full and adjust - * header_bh and first_bh if the insert place is moved to the new cluster. + * If current cluster is full, we'll allocate a new one. This may not + * be contiguous. The underlying calls will make sure that there is + * space for the insert, shifting buckets around if necessary. + * 'target' may be moved by those calls. */ static int ocfs2_add_new_xattr_bucket(struct inode *inode, struct buffer_head *xb_bh, - struct buffer_head *header_bh) + struct ocfs2_xattr_bucket *target, + struct ocfs2_xattr_set_ctxt *ctxt) { - struct ocfs2_xattr_header *first_xh = NULL; - struct buffer_head *first_bh = NULL; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; struct ocfs2_extent_list *el = &xb_root->xt_list; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; - u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); - struct super_block *sb = inode->i_sb; - struct ocfs2_super *osb = OCFS2_SB(sb); + u32 name_hash = + le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int ret, num_buckets, extend = 1; u64 p_blkno; u32 e_cpos, num_clusters; + /* The bucket at the front of the extent */ + struct ocfs2_xattr_bucket *first; - mlog(0, "Add new xattr bucket starting form %llu\n", - (unsigned long long)header_bh->b_blocknr); + mlog(0, "Add new xattr bucket starting from %llu\n", + (unsigned long long)bucket_blkno(target)); - /* - * Add refrence for header_bh here because it may be - * changed in ocfs2_add_new_xattr_cluster and we need - * to free it in the end. - */ - get_bh(header_bh); + /* The first bucket of the original extent */ + first = ocfs2_xattr_bucket_new(inode); + if (!first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, &num_clusters, el); @@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode, goto out; } - ret = ocfs2_read_block(inode, p_blkno, &first_bh); + ret = ocfs2_read_xattr_bucket(first, p_blkno); if (ret) { mlog_errno(ret); goto out; } num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; - first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; - - if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { + /* + * This can move first+target if the target bucket moves + * to the new extent. + */ ret = ocfs2_add_new_xattr_cluster(inode, xb_bh, - &first_bh, - &header_bh, + first, + target, &num_clusters, e_cpos, - p_blkno, - &extend); + &extend, + ctxt); if (ret) { mlog_errno(ret); goto out; } } - if (extend) + if (extend) { ret = ocfs2_extend_xattr_bucket(inode, - first_bh, - header_bh, + ctxt->handle, + first, + bucket_blkno(target), num_clusters); - if (ret) - mlog_errno(ret); + if (ret) + mlog_errno(ret); + } + out: - brelse(first_bh); - brelse(header_bh); + ocfs2_xattr_bucket_free(first); + return ret; } @@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, int block_off = offs >> inode->i_sb->s_blocksize_bits; offs = offs % inode->i_sb->s_blocksize; - return bucket->bhs[block_off]->b_data + offs; + return bucket_block(bucket, block_off) + offs; } /* @@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, xe->xe_value_size = 0; val = ocfs2_xattr_bucket_get_val(inode, - &xs->bucket, offs); + xs->bucket, offs); memset(val + OCFS2_XATTR_SIZE(name_len), 0, size - OCFS2_XATTR_SIZE(name_len)); if (OCFS2_XATTR_SIZE(xi->value_len) > 0) @@ -4062,8 +4630,7 @@ set_new_name_value: xh->xh_free_start = cpu_to_le16(offs); } - val = ocfs2_xattr_bucket_get_val(inode, - &xs->bucket, offs - size); + val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size); xe->xe_name_offset = cpu_to_le16(offs - size); memset(val, 0, size); @@ -4079,125 +4646,45 @@ set_new_name_value: return; } -static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_search *xs, - struct buffer_head **bhs, - u16 bh_num) -{ - int ret = 0, off, block_off; - struct ocfs2_xattr_entry *xe = xs->here; - - /* - * First calculate all the blocks we should journal_access - * and journal_dirty. The first block should always be touched. - */ - ret = ocfs2_journal_dirty(handle, bhs[0]); - if (ret) - mlog_errno(ret); - - /* calc the data. */ - off = le16_to_cpu(xe->xe_name_offset); - block_off = off >> inode->i_sb->s_blocksize_bits; - ret = ocfs2_journal_dirty(handle, bhs[block_off]); - if (ret) - mlog_errno(ret); - - return ret; -} - /* * Set the xattr entry in the specified bucket. * The bucket is indicated by xs->bucket and it should have the enough * space for the xattr insertion. */ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, u32 name_hash, int local) { - int i, ret; - handle_t *handle = NULL; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; + u64 blkno; mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", (unsigned long)xi->value_len, xi->name_index, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + (unsigned long long)bucket_blkno(xs->bucket)); - if (!xs->bucket.bhs[1]) { - ret = ocfs2_read_blocks(inode, - xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); + if (!xs->bucket->bu_bhs[1]) { + blkno = bucket_blkno(xs->bucket); + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); if (ret) { mlog_errno(ret); goto out; } } - handle = ocfs2_start_trans(osb, blk_per_bucket); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); - /*Only dirty the blocks we have touched in set xattr. */ - ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, - xs->bucket.bhs, blk_per_bucket); - if (ret) - mlog_errno(ret); -out: - ocfs2_commit_trans(osb, handle); - - return ret; -} - -static int ocfs2_xattr_value_update_size(struct inode *inode, - struct buffer_head *xe_bh, - struct ocfs2_xattr_entry *xe, - u64 new_size) -{ - int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle = NULL; - - handle = ocfs2_start_trans(osb, 1); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, xe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - xe->xe_value_size = cpu_to_le64(new_size); - - ret = ocfs2_journal_dirty(handle, xe_bh); - if (ret < 0) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); out: return ret; } @@ -4210,18 +4697,19 @@ out: * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. */ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, - struct buffer_head *header_bh, + struct ocfs2_xattr_bucket *bucket, int xe_off, - int len) + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, offset; u64 value_blk; - struct buffer_head *value_bh = NULL; - struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; xe = &xh->xh_entries[xe_off]; @@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, /* We don't allow ocfs2_xattr_value to be stored in different block. */ BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); - value_blk += header_bh->b_blocknr; - ret = ocfs2_read_block(inode, value_blk, &value_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + vb.vb_bh = bucket->bu_bhs[value_blk]; + BUG_ON(!vb.vb_bh); - xv = (struct ocfs2_xattr_value_root *) - (value_bh->b_data + offset % blocksize); + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (vb.vb_bh->b_data + offset % blocksize); - mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", - xe_off, (unsigned long long)header_bh->b_blocknr, len); - ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + /* + * From here on out we have to dirty the bucket. The generic + * value calls only modify one of the bucket's bhs, but we need + * to send the bucket at once. So if they error, they *could* have + * modified something. We have to assume they did, and dirty + * the whole bucket. This leaves us in a consistent state. + */ + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)bucket_blkno(bucket), len); + ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); if (ret) { mlog_errno(ret); - goto out; + goto out_dirty; } + xe->xe_value_size = cpu_to_le64(len); + +out_dirty: + ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); + out: - brelse(value_bh); return ret; } static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, - struct ocfs2_xattr_search *xs, - int len) + struct ocfs2_xattr_search *xs, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, offset; struct ocfs2_xattr_entry *xe = xs->here; struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; - BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe)); offset = xe - xh->xh_entries; - ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], - offset, len); + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket, + offset, len, ctxt); if (ret) mlog_errno(ret); @@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, } static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_search *xs, char *val, int value_len) @@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); - return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); + return __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); } static int ocfs2_rm_xattr_cluster(struct inode *inode, @@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, } } - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -4392,26 +4891,19 @@ out: } static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_search *xs) { - handle_t *handle = NULL; - struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); struct ocfs2_xattr_entry *last = &xh->xh_entries[ le16_to_cpu(xh->xh_count) - 1]; int ret = 0; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - return; - } - - ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + return; } /* Remove the old entry. */ @@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, memset(last, 0, sizeof(struct ocfs2_xattr_entry)); le16_add_cpu(&xh->xh_count, -1); - ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); - if (ret < 0) - mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); } /* @@ -4440,7 +4928,8 @@ out_commit: */ static int ocfs2_xattr_set_in_bucket(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, local = 1; size_t value_len; @@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, value_len = 0; ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len); + value_len, + ctxt); if (ret) goto out; @@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, xi->value_len = OCFS2_XATTR_ROOT_SIZE; } - ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, + name_hash, local); if (ret) { mlog_errno(ret); goto out; @@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, /* allocate the space now for the outside block storage. */ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len); + value_len, ctxt); if (ret) { mlog_errno(ret); @@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, * storage and we have allocated xattr already, * so need to remove it. */ - ocfs2_xattr_bucket_remove_xs(inode, xs); + ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs); } goto out; } set_value_outside: - ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); + ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle, + xs, val, value_len); out: return ret; } @@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, struct ocfs2_xattr_bucket *bucket, const char *name) { - struct ocfs2_xattr_header *xh = bucket->xh; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) @@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, xh->xh_entries[0].xe_name_hash) { mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " "hash = %u\n", - (unsigned long long)bucket->bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(bucket), le32_to_cpu(xh->xh_entries[0].xe_name_hash)); return -ENOSPC; } @@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; u16 count, header_size, xh_free_start; - int i, free, max_free, need, old; + int free, max_free, need, old; size_t value_size = 0, name_len = strlen(xi->name); size_t blocksize = inode->i_sb->s_blocksize; int ret, allocation = 0; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); mlog_entry("Set xattr %s in xattr index block\n", xi->name); @@ -4574,7 +5066,7 @@ try_again: mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " "of %u which exceed block size\n", - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(xs->bucket), header_size); if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) @@ -4614,11 +5106,13 @@ try_again: mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" " %u\n", xs->not_found, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(xs->bucket), free, need, max_free, le16_to_cpu(xh->xh_free_start), le16_to_cpu(xh->xh_name_value_len)); - if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (free < need || + (xs->not_found && + count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) { if (need <= max_free && count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { /* @@ -4626,7 +5120,8 @@ try_again: * name/value will be moved, the xe shouldn't be changed * in xs. */ - ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); if (ret) { mlog_errno(ret); goto out; @@ -4658,7 +5153,7 @@ try_again: * add a new bucket for the insert. */ ret = ocfs2_check_xattr_bucket_collision(inode, - &xs->bucket, + xs->bucket, xi->name); if (ret) { mlog_errno(ret); @@ -4667,17 +5162,21 @@ try_again: ret = ocfs2_add_new_xattr_bucket(inode, xs->xattr_bh, - xs->bucket.bhs[0]); + xs->bucket, + ctxt); if (ret) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) - brelse(xs->bucket.bhs[i]); - - memset(&xs->bucket, 0, sizeof(xs->bucket)); - + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. + */ + ocfs2_xattr_bucket_relse(xs->bucket); ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, xi->name_index, xi->name, xs); @@ -4689,7 +5188,7 @@ try_again: } xattr_set: - ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); out: mlog_exit(ret); return ret; @@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, void *para) { int ret = 0; - struct ocfs2_xattr_header *xh = bucket->xh; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 i; struct ocfs2_xattr_entry *xe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; + int credits = ocfs2_remove_extent_credits(osb->sb) + + ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { xe = &xh->xh_entries[i]; if (ocfs2_xattr_is_local(xe)) continue; - ret = ocfs2_xattr_bucket_value_truncate(inode, - bucket->bhs[0], - i, 0); + ctxt.handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, + i, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); if (ret) { mlog_errno(ret); break; } } + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); return ret; } @@ -4768,6 +5284,74 @@ out: } /* + * 'security' attributes support + */ +static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int ocfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, + size, flags); +} + +int ocfs2_init_security_get(struct inode *inode, + struct inode *dir, + struct ocfs2_security_xattr_info *si) +{ + /* check whether ocfs2 support feature xattr */ + if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) + return -EOPNOTSUPP; + return security_inode_init_security(inode, dir, &si->name, &si->value, + &si->value_len); +} + +int ocfs2_init_security_set(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_security_xattr_info *si, + struct ocfs2_alloc_context *xattr_ac, + struct ocfs2_alloc_context *data_ac) +{ + return ocfs2_xattr_set_handle(handle, inode, di_bh, + OCFS2_XATTR_INDEX_SECURITY, + si->name, si->value, si->value_len, 0, + xattr_ac, data_ac); +} + +struct xattr_handler ocfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ocfs2_xattr_security_list, + .get = ocfs2_xattr_security_get, + .set = ocfs2_xattr_security_set, +}; + +/* * 'trusted' attributes support */ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 1d8314c7656d..5a1ebc789f7e 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -30,13 +30,58 @@ enum ocfs2_xattr_type { OCFS2_XATTR_MAX }; +struct ocfs2_security_xattr_info { + int enable; + char *name; + void *value; + size_t value_len; +}; + extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; +extern struct xattr_handler ocfs2_xattr_security_handler; +#ifdef CONFIG_OCFS2_FS_POSIX_ACL +extern struct xattr_handler ocfs2_xattr_acl_access_handler; +extern struct xattr_handler ocfs2_xattr_acl_default_handler; +#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, + const char *, void *, size_t); int ocfs2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, + int, const char *, const void *, size_t, int, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); +int ocfs2_init_security_get(struct inode *, struct inode *, + struct ocfs2_security_xattr_info *); +int ocfs2_init_security_set(handle_t *, struct inode *, + struct buffer_head *, + struct ocfs2_security_xattr_info *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_calc_security_init(struct inode *, + struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); +int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, + int, struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); + +/* + * xattrs can live inside an inode, as part of an external xattr block, + * or inside an xattr bucket, which is the leaf of a tree rooted in an + * xattr block. Some of the xattr calls, especially the value setting + * functions, want to treat each of these locations as equal. Let's wrap + * them in a structure that we can pass around instead of raw buffer_heads. + */ +struct ocfs2_xattr_value_buf { + struct buffer_head *vb_bh; + ocfs2_journal_access_func vb_access; + struct ocfs2_xattr_value_root *vb_xv; +}; + #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6afe57c84f84..633e9dc972bb 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/open.c b/fs/open.c index c0a426d5766c..d882fd2351d6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) goto put_write_and_out; error = locks_verify_truncate(inode, NULL, length); + if (!error) + error = security_path_truncate(&path, length, 0); if (!error) { DQUOT_INIT(inode); error = do_truncate(path.dentry, length, 0, NULL); @@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) + error = security_path_truncate(&file->f_path, length, + ATTR_MTIME|ATTR_CTIME); + if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); out_putf: fput(file); @@ -407,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) goto out_fput; - if (inode->i_op && inode->i_op->fallocate) + if (inode->i_op->fallocate) ret = inode->i_op->fallocate(inode, mode, offset, len); else ret = -EOPNOTSUPP; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d41bdc784de4..ffcd04f0012c 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -256,9 +256,6 @@ found: break; } - inode->i_gid = 0; - inode->i_uid = 0; - d_add(dentry, inode); return NULL; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d5b213b8a9b..6d720243f5f4 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno) blk_free_devt(part_devt(part)); rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) - snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); + dev_set_name(pdev, "%sp%d", dname, partno); else - snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); + dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; @@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk) struct block_device *bdev; struct disk_part_iter piter; struct hd_struct *part; - char *s; int err; ddev->parent = disk->driverfs_dev; - strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); - /* ewww... some of these buggers have / in the name... */ - s = strchr(ddev->bus_id, '/'); - if (s) - *s = '!'; + dev_set_name(ddev, disk->disk_name); /* delay uevents, until we scanned partition table */ ddev->uevent_suppress = 1; diff --git a/fs/pipe.c b/fs/pipe.c index aaf797bd57b9..891697112f66 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags) goto err_fdr; fdw = error; - error = audit_fd_pair(fdr, fdw); - if (error < 0) - goto err_fdw; - + audit_fd_pair(fdr, fdw); fd_install(fdr, fr); fd_install(fdw, fw); fd[0] = fdr; @@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags) return 0; - err_fdw: - put_unused_fd(fdw); err_fdr: put_unused_fd(fdr); err_read_pipe: diff --git a/fs/proc/base.c b/fs/proc/base.c index cad92c1ac2b3..0c9de19a1633 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -65,6 +65,7 @@ #include <linux/mm.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> +#include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> @@ -109,25 +110,22 @@ struct pid_entry { .op = OP, \ } -#define DIR(NAME, MODE, OTYPE) \ - NOD(NAME, (S_IFDIR|(MODE)), \ - &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ - {} ) -#define LNK(NAME, OTYPE) \ +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ - { .proc_get_link = &proc_##OTYPE##_link } ) -#define REG(NAME, MODE, OTYPE) \ - NOD(NAME, (S_IFREG|(MODE)), NULL, \ - &proc_##OTYPE##_operations, {}) -#define INF(NAME, MODE, OTYPE) \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define INF(NAME, MODE, read) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_info_file_operations, \ - { .proc_read = &proc_##OTYPE } ) -#define ONE(NAME, MODE, OTYPE) \ + { .proc_read = read } ) +#define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ - { .proc_show = &proc_##OTYPE } ) + { .proc_show = show } ) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) struct mm_struct *mm = get_task_mm(task); if (mm) { unsigned int nwords = 0; - do + do { nwords += 2; - while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ res = nwords * sizeof(mm->saved_auxv[0]); if (res > PAGE_SIZE) res = PAGE_SIZE; @@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%p>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + kfree(entries); + + return 0; +} +#endif + #ifdef CONFIG_SCHEDSTATS /* * Provides /proc/PID/schedstat @@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v) struct inode *inode = m->private; struct task_struct *p; - WARN_ON(!inode); - p = get_proc_task(inode); if (!p) return -ESRCH; @@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf, struct inode *inode = file->f_path.dentry->d_inode; struct task_struct *p; - WARN_ON(!inode); - p = get_proc_task(inode); if (!p) return -ESRCH; @@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st if (!ei->pid) goto out_unlock; - inode->i_uid = 0; - inode->i_gid = 0; if (task_dumpable(task)) { rcu_read_lock(); cred = __task_cred(task); @@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - struct inode *inode; struct dentry *error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); - inode = NULL; if (!task) goto out_no_task; @@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = { }; static const struct pid_entry attr_dir_stuff[] = { - REG("current", S_IRUGO|S_IWUGO, pid_attr), - REG("prev", S_IRUGO, pid_attr), - REG("exec", S_IRUGO|S_IWUGO, pid_attr), - REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), - REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), - REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), }; static int proc_attr_dir_readdir(struct file * filp, @@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (!ei->pid) goto out_iput; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) inode->i_nlink = 2; @@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { - DIR("task", S_IRUGO|S_IXUGO, task), - DIR("fd", S_IRUSR|S_IXUSR, fd), - DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), #ifdef CONFIG_NET - DIR("net", S_IRUGO|S_IXUGO, net), + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif - REG("environ", S_IRUSR, environ), - INF("auxv", S_IRUSR, pid_auxv), - ONE("status", S_IRUGO, pid_status), - ONE("personality", S_IRUSR, pid_personality), - INF("limits", S_IRUSR, pid_limits), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + INF("limits", S_IRUSR, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, pid_cmdline), - ONE("stat", S_IRUGO, tgid_stat), - ONE("statm", S_IRUGO, pid_statm), - REG("maps", S_IRUGO, maps), + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, numa_maps), + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), #endif - REG("mem", S_IRUSR|S_IWUSR, mem), - LNK("cwd", cwd), - LNK("root", root), - LNK("exe", exe), - REG("mounts", S_IRUGO, mounts), - REG("mountinfo", S_IRUGO, mountinfo), - REG("mountstats", S_IRUSR, mountstats), + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR - REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), - REG("pagemap", S_IRUSR, pagemap), + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY - DIR("attr", S_IRUGO|S_IXUGO, attr_dir), + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, pid_wchan), + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, pid_schedstat), + INF("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP - REG("latency", S_IRUGO, lstats), + REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, cpuset), + REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, cgroup), + REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), #ifdef CONFIG_AUDITSYSCALL - REG("loginuid", S_IWUSR|S_IRUGO, loginuid), - REG("sessionid", S_IRUGO, sessionid), + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION - REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) - REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, tgid_io_accounting), + INF("io", S_IRUGO, proc_tgid_io_accounting), #endif }; @@ -2805,66 +2827,69 @@ out_no_task: * Tasks */ static const struct pid_entry tid_base_stuff[] = { - DIR("fd", S_IRUSR|S_IXUSR, fd), - DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), - REG("environ", S_IRUSR, environ), - INF("auxv", S_IRUSR, pid_auxv), - ONE("status", S_IRUGO, pid_status), - ONE("personality", S_IRUSR, pid_personality), - INF("limits", S_IRUSR, pid_limits), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + INF("limits", S_IRUSR, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, pid_cmdline), - ONE("stat", S_IRUGO, tid_stat), - ONE("statm", S_IRUGO, pid_statm), - REG("maps", S_IRUGO, maps), + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, numa_maps), + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), #endif - REG("mem", S_IRUSR|S_IWUSR, mem), - LNK("cwd", cwd), - LNK("root", root), - LNK("exe", exe), - REG("mounts", S_IRUGO, mounts), - REG("mountinfo", S_IRUGO, mountinfo), + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR - REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), - REG("pagemap", S_IRUSR, pagemap), + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY - DIR("attr", S_IRUGO|S_IXUGO, attr_dir), + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, pid_wchan), + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, pid_schedstat), + INF("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP - REG("latency", S_IRUGO, lstats), + REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, cpuset), + REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, cgroup), + REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), #ifdef CONFIG_AUDITSYSCALL - REG("loginuid", S_IWUSR|S_IRUGO, loginuid), - REG("sessionid", S_IRUSR, sessionid), + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUSR, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION - REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, tid_io_accounting), + INF("io", S_IRUGO, proc_tid_io_accounting), #endif }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 60a359b35582..db7fa5cab988 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -14,7 +14,6 @@ #include <linux/stat.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/idr.h> #include <linux/namei.h> @@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct inode *inode = NULL; int error = -ENOENT; - lock_kernel(); spin_lock(&proc_subdir_lock); for (de = de->subdir; de ; de = de->next) { if (de->namelen != dentry->d_name.len) @@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, } spin_unlock(&proc_subdir_lock); out_unlock: - unlock_kernel(); if (inode) { dentry->d_op = &proc_dentry_operations; @@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; - lock_kernel(); - ino = inode->i_ino; i = filp->f_pos; switch (i) { @@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, spin_unlock(&proc_subdir_lock); } ret = 1; -out: unlock_kernel(); +out: return ret; } @@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) * the /proc directory. */ static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = proc_readdir, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c658..3e76bb9b3ad6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) */ void de_put(struct proc_dir_entry *de) { - lock_kernel(); if (!atomic_read(&de->count)) { printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); return; } if (atomic_dec_and_test(&de->count)) free_proc_entry(de); - unlock_kernel(); } /* diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61ce..cd53ff838498 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -41,8 +41,6 @@ do { \ (vmi)->used = 0; \ (vmi)->largest_chunk = 0; \ } while(0) - -extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66da..43d23948384a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" #endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" @@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), #endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_read(&mmap_pages_allocated)), +#endif K(i.totalswap), K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d2632947..b446d7ad0b0d 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,33 +33,33 @@ #include "internal.h" /* - * display a single VMA to a sequenced file + * display a single region to a sequenced file */ -int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_region_show(struct seq_file *m, struct vm_region *region) { unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - flags = vma->vm_flags; - file = vma->vm_file; + flags = region->vm_flags; + file = region->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = region->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + region->vm_start, + region->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { @@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } /* - * display a list of all the VMAs the kernel knows about + * display a list of all the REGIONs the kernel knows about * - nommu kernals have a single flat list */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +static int nommu_region_list_show(struct seq_file *m, void *_p) { - struct vm_area_struct *vma; + struct rb_node *p = _p; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - return nommu_vma_show(m, vma); + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); } -static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) { - struct rb_node *_rb; + struct rb_node *p; loff_t pos = *_pos; - void *next = NULL; - down_read(&nommu_vma_sem); + down_read(&nommu_region_sem); - for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { - if (pos == 0) { - next = _rb; - break; - } - pos--; - } - - return next; + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; } -static void nommu_vma_list_stop(struct seq_file *m, void *v) +static void nommu_region_list_stop(struct seq_file *m, void *v) { - up_read(&nommu_vma_sem); + up_read(&nommu_region_sem); } -static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rb_next((struct rb_node *) v); } -static const struct seq_operations proc_nommu_vma_list_seqop = { - .start = nommu_vma_list_start, - .next = nommu_vma_list_next, - .stop = nommu_vma_list_stop, - .show = nommu_vma_list_show +static struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show }; -static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) { - return seq_open(file, &proc_nommu_vma_list_seqop); + return seq_open(file, &proc_nommu_region_list_seqop); } -static const struct file_operations proc_nommu_vma_list_operations = { - .open = proc_nommu_vma_list_open, +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7bc296f424ae..04d1270f1c38 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -18,7 +18,6 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> @@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, } const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = proc_tgid_net_readdir, }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 06ed10b7da9e..94fcfff6863a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ inode->i_mode = table->mode; - inode->i_uid = inode->i_gid = 0; if (!table->child) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; diff --git a/fs/proc/root.c b/fs/proc/root.c index 7761602af9de..f6299a25594e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,7 +16,6 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include <linux/pid_namespace.h> @@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp, unsigned int nr = filp->f_pos; int ret; - lock_kernel(); - if (nr < FIRST_PROCESS_ENTRY) { int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) { - unlock_kernel(); + if (error <= 0) return error; - } filp->f_pos = FIRST_PROCESS_ENTRY; } - unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); return ret; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 81904f07679d..f75efa22df5e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/irqnr.h> #include <asm/cputime.h> #ifndef arch_irq_stat_cpu @@ -44,10 +45,9 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - - for_each_irq_nr(j) + for_each_irq_nr(j) { sum += kstat_irqs_cpu(j, i); - + } sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -92,7 +92,6 @@ static int show_stat(struct seq_file *p, void *v) /* sum again ? it could be updated? */ for_each_irq_nr(j) { per_irq_sum = 0; - for_each_possible_cpu(i) per_irq_sum += kstat_irqs_cpu(j, i); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3a8bdd7f5756..94063840832a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" - "Swap: %8lu kB\n", + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, - mss.swap >> 10); + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 219bd79ea894..343ea1216bc8 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -9,31 +9,38 @@ /* * Logic: we've got two memory sums for each process, "shared", and - * "non-shared". Shared memory may get counted more then once, for + * "non-shared". Shared memory may get counted more than once, for * each process that owns it. Non-shared memory is counted * accurately. */ void task_mem(struct seq_file *m, struct mm_struct *mm) { - struct vm_list_struct *vml; - unsigned long bytes = 0, sbytes = 0, slack = 0; + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; down_read(&mm->mmap_sem); - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (!vml->vma) - continue; + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } - bytes += kobjsize(vml); if (atomic_read(&mm->mm_count) > 1 || - atomic_read(&vml->vma->vm_usage) > 1 - ) { - sbytes += kobjsize((void *) vml->vma->vm_start); - sbytes += kobjsize(vml->vma); + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; } else { - bytes += kobjsize((void *) vml->vma->vm_start); - bytes += kobjsize(vml->vma); - slack += kobjsize((void *) vml->vma->vm_start) - - (vml->vma->vm_end - vml->vma->vm_start); + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; } } @@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long vsize = 0; down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - if (tbp->vma) - vsize += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; } up_read(&mm->mmap_sem); return vsize; @@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; int size = kobjsize(mm); down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - size += kobjsize(tbp); - if (tbp->vma) { - size += kobjsize(tbp->vma); - size += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += kobjsize(region); + size += region->vm_end - region->vm_start; } } @@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, } /* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + vma->vm_pgoff << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_vml) +static int show_map(struct seq_file *m, void *_p) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; - return nommu_vma_show(m, vml->vma); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_list_struct *vml; struct mm_struct *mm; + struct rb_node *p; loff_t n = *pos; /* pin the task and mm whilst we play with them */ @@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) } /* start from the Nth VMA */ - for (vml = mm->context.vmlist; vml; vml = vml->next) + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) - return vml; + return p; return NULL; } @@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml) } } -static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; (*pos)++; - return vml ? vml->next : NULL; + return p ? rb_next(p) : NULL; } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 03ec59504906..5edcc3f92ba7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - if (pfn > saved_max_pfn) - return -EINVAL; do { if (count > (PAGE_SIZE - offset)) diff --git a/fs/quota.c b/fs/quota.c index b7fe44e01618..4a8c94f05f76 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid case Q_SETQUOTA: case Q_GETQUOTA: /* This is just informative test so we are satisfied without a lock */ - if (!sb_has_quota_enabled(sb, type)) + if (!sb_has_quota_active(sb, type)) return -ESRCH; } @@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type) int cnt; sb->s_qcop->quota_sync(sb, type); + + if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) + return; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ @@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); @@ -201,7 +204,7 @@ restart: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && type != cnt) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) @@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __u32 fmt; down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_enabled(sb, type)) { + if (!sb_has_quota_active(sb, type)) { up_read(&sb_dqopt(sb)->dqptr_sem); return -ESRCH; } diff --git a/fs/quota_tree.c b/fs/quota_tree.c new file mode 100644 index 000000000000..953404c95b17 --- /dev/null +++ b/fs/quota_tree.c @@ -0,0 +1,645 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/dqblk_v2.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/quotaops.h> + +#include <asm/byteorder.h> + +#include "quota_tree.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota trie support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_QT_PARANOIA + +typedef char *dqbuf_t; + +static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) +{ + unsigned int epb = info->dqi_usable_bs >> 2; + + depth = info->dqi_qtree_depth - depth - 1; + while (depth--) + id /= epb; + return id % epb; +} + +/* Number of entries in one blocks */ +static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) +{ + return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) + / info->dqi_entry_size; +} + +static dqbuf_t getdqbuf(size_t size) +{ + dqbuf_t buf = kmalloc(size, GFP_NOFS); + if (!buf) + printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static inline void freedqbuf(dqbuf_t buf) +{ + kfree(buf); +} + +static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) +{ + struct super_block *sb = info->dqi_sb; + + memset(buf, 0, info->dqi_usable_bs); + return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) +{ + struct super_block *sb = info->dqi_sb; + + return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct qtree_mem_dqinfo *info) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->dqi_free_blk) { + blk = info->dqi_free_blk; + ret = read_blk(info, blk, buf); + if (ret < 0) + goto out_buf; + info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, info->dqi_usable_bs); + /* Assure block allocation... */ + ret = write_blk(info, info->dqi_blocks, buf); + if (ret < 0) + goto out_buf; + blk = info->dqi_blocks++; + } + mark_info_dirty(info->dqi_sb, info->dqi_type); + ret = blk; +out_buf: + freedqbuf(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + err = write_blk(info, blk, buf); + if (err < 0) + return err; + info->dqi_free_blk = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free); + uint prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + err = read_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + dh->dqdh_prev_free; + err = write_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + } + if (prevblk) { + err = read_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = + dh->dqdh_next_free; + err = write_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + } else { + info->dqi_free_entry = nextblk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + } + freedqbuf(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + /* No matter whether write succeeds block is out of list */ + if (write_blk(info, blk, buf) < 0) + printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Insert given block to the beginning of list with free entries */ +static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + err = write_blk(info, blk, buf); + if (err < 0) + goto out_buf; + if (info->dqi_free_entry) { + err = read_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + cpu_to_le32(blk); + err = write_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + } + freedqbuf(tmpbuf); + info->dqi_free_entry = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Is the entry in the block free? */ +int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) +{ + int i; + + for (i = 0; i < info->dqi_entry_size; i++) + if (disk[i]) + return 0; + return 1; +} +EXPORT_SYMBOL(qtree_entry_unused); + +/* Find space for dquot */ +static uint find_free_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, int *err) +{ + uint blk, i; + struct qt_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + char *ddquot; + + *err = 0; + if (!buf) { + *err = -ENOMEM; + return 0; + } + dh = (struct qt_disk_dqdbheader *)buf; + if (info->dqi_free_entry) { + blk = info->dqi_free_entry; + *err = read_blk(info, blk, buf); + if (*err < 0) + goto out_buf; + } else { + blk = get_free_dqblk(info); + if ((int)blk < 0) { + *err = blk; + freedqbuf(buf); + return 0; + } + memset(buf, 0, info->dqi_usable_bs); + /* This is enough as block is already zeroed and entry list is empty... */ + info->dqi_free_entry = blk; + mark_info_dirty(dquot->dq_sb, dquot->dq_type); + } + /* Block will be full? */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { + *err = remove_free_dqentry(info, buf, blk); + if (*err < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't " + "remove block (%u) from entry free list.\n", + blk); + goto out_buf; + } + } + le16_add_cpu(&dh->dqdh_entries, 1); + /* Find free structure in block */ + for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); + i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot); + i++, ddquot += info->dqi_entry_size); +#ifdef __QUOTA_QT_PARANOIA + if (i == qtree_dqstr_in_blk(info)) { + printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " + "but it shouldn't.\n"); + *err = -EIO; + goto out_buf; + } +#endif + *err = write_blk(info, blk, buf); + if (*err < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " + "data block %u.\n", blk); + goto out_buf; + } + dquot->dq_off = (blk << info->dqi_blocksize_bits) + + sizeof(struct qt_disk_dqdbheader) + + i * info->dqi_entry_size; + freedqbuf(buf); + return blk; +out_buf: + freedqbuf(buf); + return 0; +} + +/* Insert reference to structure into the trie */ +static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *treeblk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0, newson = 0, newact = 0; + __le32 *ref; + uint newblk; + + if (!buf) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, info->dqi_usable_bs); + newact = 1; + } else { + ret = read_blk(info, *treeblk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read tree quota block " + "%u.\n", *treeblk); + goto out_buf; + } + } + ref = (__le32 *)buf; + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == info->dqi_qtree_depth - 1) { +#ifdef __QUOTA_QT_PARANOIA + if (newblk) { + printk(KERN_ERR "VFS: Inserting already present quota " + "entry (block %u).\n", + le32_to_cpu(ref[get_index(info, + dquot->dq_id, depth)])); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(info, dquot, &ret); + } else { + ret = do_insert_tree(info, dquot, &newblk, depth+1); + } + if (newson && ret >= 0) { + ref[get_index(info, dquot->dq_id, depth)] = + cpu_to_le32(newblk); + ret = write_blk(info, *treeblk, buf); + } else if (newact && ret < 0) { + put_free_dqblk(info, buf, *treeblk); + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + int tmp = QT_TREEOFF; + return do_insert_tree(info, dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota files... + */ +int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + ssize_t ret; + dqbuf_t ddquot = getdqbuf(info->dqi_entry_size); + + if (!ddquot) + return -ENOMEM; + + /* dq_off is guarded by dqio_mutex */ + if (!dquot->dq_off) { + ret = dq_insert_tree(info, dquot); + if (ret < 0) { + printk(KERN_ERR "VFS: Error %zd occurred while " + "creating quota.\n", ret); + freedqbuf(ddquot); + return ret; + } + } + spin_lock(&dq_data_lock); + info->dqi_ops->mem2disk_dqblk(ddquot, dquot); + spin_unlock(&dq_data_lock); + ret = sb->s_op->quota_write(sb, type, (char *)ddquot, + info->dqi_entry_size, dquot->dq_off); + if (ret != info->dqi_entry_size) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + sb->s_id); + if (ret >= 0) + ret = -ENOSPC; + } else { + ret = 0; + } + dqstats.writes++; + freedqbuf(ddquot); + + return ret; +} +EXPORT_SYMBOL(qtree_write_dquot); + +/* Free dquot entry in data block */ +static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint blk) +{ + struct qt_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { + printk(KERN_ERR "VFS: Quota structure has offset to other " + "block (%u) than it should (%u).\n", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + goto out_buf; + } + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + goto out_buf; + } + dh = (struct qt_disk_dqdbheader *)buf; + le16_add_cpu(&dh->dqdh_entries, -1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ + ret = remove_free_dqentry(info, buf, blk); + if (ret >= 0) + ret = put_free_dqblk(info, buf, blk); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't move quota data block (%u) " + "to free list.\n", blk); + goto out_buf; + } + } else { + memset(buf + + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), + 0, info->dqi_entry_size); + if (le16_to_cpu(dh->dqdh_entries) == + qtree_dqstr_in_blk(info) - 1) { + /* Insert will write block itself */ + ret = insert_free_dqentry(info, buf, blk); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't insert quota data " + "block (%u) to free entry list.\n", blk); + goto out_buf; + } + } else { + ret = write_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't write quota data " + "block %u\n", blk); + goto out_buf; + } + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + freedqbuf(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *blk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + uint newblk; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, *blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (depth == info->dqi_qtree_depth - 1) { + ret = free_dqentry(info, dquot, newblk); + newblk = 0; + } else { + ret = remove_tree(info, dquot, &newblk, depth+1); + } + if (ret >= 0 && !newblk) { + int i; + ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); + /* Block got empty? */ + for (i = 0; + i < (info->dqi_usable_bs >> 2) && !ref[i]; + i++); + /* Don't put the root block into the free block list */ + if (i == (info->dqi_usable_bs >> 2) + && *blk != QT_TREEOFF) { + put_free_dqblk(info, buf, *blk); + *blk = 0; + } else { + ret = write_blk(info, *blk, buf); + if (ret < 0) + printk(KERN_ERR "VFS: Can't write quota tree " + "block %u.\n", *blk); + } + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Delete dquot from tree */ +int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + uint tmp = QT_TREEOFF; + + if (!dquot->dq_off) /* Even not allocated? */ + return 0; + return remove_tree(info, dquot, &tmp, 0); +} +EXPORT_SYMBOL(qtree_delete_dquot); + +/* Find entry in block */ +static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + int i; + char *ddquot; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); + i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot); + i++, ddquot += info->dqi_entry_size); + if (i == qtree_dqstr_in_blk(info)) { + printk(KERN_ERR "VFS: Quota for id %u referenced " + "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } else { + ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + qt_disk_dqdbheader) + i * info->dqi_entry_size; + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < info->dqi_qtree_depth - 1) + ret = find_tree_dqentry(info, dquot, blk, depth+1); + else + ret = find_block_dqentry(info, dquot, blk); +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); +} + +int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + loff_t offset; + dqbuf_t ddquot; + int ret = 0; + +#ifdef __QUOTA_QT_PARANOIA + /* Invalidated quota? */ + if (!sb_dqopt(dquot->dq_sb)->files[type]) { + printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + return -EIO; + } +#endif + /* Do we know offset of the dquot entry in the quota file? */ + if (!dquot->dq_off) { + offset = find_dqentry(info, dquot); + if (offset <= 0) { /* Entry not present? */ + if (offset < 0) + printk(KERN_ERR "VFS: Can't read quota " + "structure for id %u.\n", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + goto out; + } + dquot->dq_off = offset; + } + ddquot = getdqbuf(info->dqi_entry_size); + if (!ddquot) + return -ENOMEM; + ret = sb->s_op->quota_read(sb, type, (char *)ddquot, + info->dqi_entry_size, dquot->dq_off); + if (ret != info->dqi_entry_size) { + if (ret >= 0) + ret = -EIO; + printk(KERN_ERR "VFS: Error while reading quota " + "structure for id %u.\n", dquot->dq_id); + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + freedqbuf(ddquot); + goto out; + } + spin_lock(&dq_data_lock); + info->dqi_ops->disk2mem_dqblk(dquot, ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + freedqbuf(ddquot); +out: + dqstats.reads++; + return ret; +} +EXPORT_SYMBOL(qtree_read_dquot); + +/* Check whether dquot should not be deleted. We know we are + * the only one operating on dquot (thanks to dq_lock) */ +int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return qtree_delete_dquot(info, dquot); + return 0; +} +EXPORT_SYMBOL(qtree_release_dquot); diff --git a/fs/quota_tree.h b/fs/quota_tree.h new file mode 100644 index 000000000000..a1ab8db81a51 --- /dev/null +++ b/fs/quota_tree.h @@ -0,0 +1,25 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTA_TREE_H +#define _LINUX_QUOTA_TREE_H + +#include <linux/types.h> +#include <linux/quota.h> + +/* + * Structure of header of block with quota structures. It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct qt_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ + __le32 dqdh_prev_free; /* Number of previous block with free entry */ + __le16 dqdh_entries; /* Number of valid entries in block */ + __le16 dqdh_pad1; + __le32 dqdh_pad2; +}; + +#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ + +#endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota_v1.c b/fs/quota_v1.c index 5ae15b13eeb0..b4af1c69ad16 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -3,25 +3,39 @@ #include <linux/quota.h> #include <linux/quotaops.h> #include <linux/dqblk_v1.h> -#include <linux/quotaio_v1.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <asm/byteorder.h> +#include "quotaio_v1.h" + MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Old quota format support"); MODULE_LICENSE("GPL"); +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v1_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v1_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) { m->dqb_ihardlimit = d->dqb_ihardlimit; m->dqb_isoftlimit = d->dqb_isoftlimit; m->dqb_curinodes = d->dqb_curinodes; - m->dqb_bhardlimit = d->dqb_bhardlimit; - m->dqb_bsoftlimit = d->dqb_bsoftlimit; - m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS; + m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); + m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); + m->dqb_curspace = v1_qbtos(d->dqb_curblocks); m->dqb_itime = d->dqb_itime; m->dqb_btime = d->dqb_btime; } @@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) d->dqb_ihardlimit = m->dqb_ihardlimit; d->dqb_isoftlimit = m->dqb_isoftlimit; d->dqb_curinodes = m->dqb_curinodes; - d->dqb_bhardlimit = m->dqb_bhardlimit; - d->dqb_bsoftlimit = m->dqb_bsoftlimit; - d->dqb_curblocks = toqb(m->dqb_curspace); + d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); + d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); + d->dqb_curblocks = v1_stoqb(m->dqb_curspace); d->dqb_itime = m->dqb_itime; d->dqb_btime = m->dqb_btime; } diff --git a/fs/quota_v2.c b/fs/quota_v2.c index b53827dc02d9..b618b563635c 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -6,7 +6,6 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> -#include <linux/quotaio_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -15,16 +14,37 @@ #include <asm/byteorder.h> +#include "quota_tree.h" +#include "quotaio_v2.h" + MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota format v2 support"); MODULE_LICENSE("GPL"); #define __QUOTA_V2_PARANOIA -typedef char *dqbuf_t; +static void v2_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2_disk2memdqb(struct dquot *dquot, void *dp); +static int v2_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2_qtree_ops = { + .mem2disk_dqblk = v2_mem2diskdqb, + .disk2mem_dqblk = v2_disk2memdqb, + .is_id = v2_is_id, +}; + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) -#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) -#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) +static inline qsize_t v2_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v2_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} /* Check whether given file is really vfsv0 quotafile */ static int v2_check_quota_file(struct super_block *sb, int type) @@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type) static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; ssize_t size; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, @@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type) sb->s_id); return -1; } + info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); + if (!info->dqi_priv) { + printk(KERN_WARNING + "Not enough memory for quota information structure.\n"); + return -1; + } + qinfo = info->dqi_priv; /* limits are stored as unsigned 32-bit data */ info->dqi_maxblimit = 0xffffffff; info->dqi_maxilimit = 0xffffffff; info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); - info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); - info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); - info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_sb = sb; + qinfo->dqi_type = type; + qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); + qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); + qinfo->dqi_ops = &v2_qtree_ops; return 0; } @@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type) static int v2_write_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo = info->dqi_priv; ssize_t size; spin_lock(&dq_data_lock); @@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type) dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); spin_unlock(&dq_data_lock); - dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); - dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); - dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); + dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { @@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type) return 0; } -static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) +static void v2_disk2memdqb(struct dquot *dquot, void *dp) { + struct v2_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); m->dqb_itime = le64_to_cpu(d->dqb_itime); - m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); - m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); + m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); m->dqb_curspace = le64_to_cpu(d->dqb_curspace); m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) + m->dqb_itime = 0; } -static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) +static void v2_mem2diskdqb(void *dp, struct dquot *dquot) { + struct v2_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); d->dqb_itime = cpu_to_le64(m->dqb_itime); - d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); - d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); + d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); - d->dqb_id = cpu_to_le32(id); -} - -static dqbuf_t getdqbuf(void) -{ - dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS); - if (!buf) - printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); - return buf; -} - -static inline void freedqbuf(dqbuf_t buf) -{ - kfree(buf); -} - -static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) -{ - memset(buf, 0, V2_DQBLKSIZE); - return sb->s_op->quota_read(sb, type, (char *)buf, - V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); -} - -static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) -{ - return sb->s_op->quota_write(sb, type, (char *)buf, - V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); -} - -/* Remove empty block from list and return it */ -static int get_free_dqblk(struct super_block *sb, int type) -{ - dqbuf_t buf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int ret, blk; - - if (!buf) - return -ENOMEM; - if (info->u.v2_i.dqi_free_blk) { - blk = info->u.v2_i.dqi_free_blk; - if ((ret = read_blk(sb, type, blk, buf)) < 0) - goto out_buf; - info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); - } - else { - memset(buf, 0, V2_DQBLKSIZE); - /* Assure block allocation... */ - if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0) - goto out_buf; - blk = info->u.v2_i.dqi_blocks++; - } - mark_info_dirty(sb, type); - ret = blk; -out_buf: - freedqbuf(buf); - return ret; -} - -/* Insert empty block to the list */ -static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk) -{ - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int err; - - dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk); - dh->dqdh_prev_free = cpu_to_le32(0); - dh->dqdh_entries = cpu_to_le16(0); - info->u.v2_i.dqi_free_blk = blk; - mark_info_dirty(sb, type); - /* Some strange block. We had better leave it... */ - if ((err = write_blk(sb, type, blk, buf)) < 0) - return err; - return 0; + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); } -/* Remove given block from the list of blocks with free entries */ -static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) +static int v2_is_id(void *dp, struct dquot *dquot) { - dqbuf_t tmpbuf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); - int err; + struct v2_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - if (!tmpbuf) - return -ENOMEM; - if (nextblk) { - if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; - if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0) - goto out_buf; - } - if (prevblk) { - if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; - if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0) - goto out_buf; - } - else { - info->u.v2_i.dqi_free_entry = nextblk; - mark_info_dirty(sb, type); - } - freedqbuf(tmpbuf); - dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); - /* No matter whether write succeeds block is out of list */ - if (write_blk(sb, type, blk, buf) < 0) - printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); - return 0; -out_buf: - freedqbuf(tmpbuf); - return err; -} - -/* Insert given block to the beginning of list with free entries */ -static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) -{ - dqbuf_t tmpbuf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int err; - - if (!tmpbuf) - return -ENOMEM; - dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); - dh->dqdh_prev_free = cpu_to_le32(0); - if ((err = write_blk(sb, type, blk, buf)) < 0) - goto out_buf; - if (info->u.v2_i.dqi_free_entry) { - if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); - if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) - goto out_buf; - } - freedqbuf(tmpbuf); - info->u.v2_i.dqi_free_entry = blk; - mark_info_dirty(sb, type); - return 0; -out_buf: - freedqbuf(tmpbuf); - return err; -} - -/* Find space for dquot */ -static uint find_free_dqentry(struct dquot *dquot, int *err) -{ - struct super_block *sb = dquot->dq_sb; - struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; - uint blk, i; - struct v2_disk_dqdbheader *dh; - struct v2_disk_dqblk *ddquot; - struct v2_disk_dqblk fakedquot; - dqbuf_t buf; - - *err = 0; - if (!(buf = getdqbuf())) { - *err = -ENOMEM; + if (qtree_entry_unused(info, dp)) return 0; - } - dh = (struct v2_disk_dqdbheader *)buf; - ddquot = GETENTRIES(buf); - if (info->u.v2_i.dqi_free_entry) { - blk = info->u.v2_i.dqi_free_entry; - if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0) - goto out_buf; - } - else { - blk = get_free_dqblk(sb, dquot->dq_type); - if ((int)blk < 0) { - *err = blk; - freedqbuf(buf); - return 0; - } - memset(buf, 0, V2_DQBLKSIZE); - /* This is enough as block is already zeroed and entry list is empty... */ - info->u.v2_i.dqi_free_entry = blk; - mark_info_dirty(sb, dquot->dq_type); - } - if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ - if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); - goto out_buf; - } - le16_add_cpu(&dh->dqdh_entries, 1); - memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); - /* Find free structure in block */ - for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); -#ifdef __QUOTA_V2_PARANOIA - if (i == V2_DQSTRINBLK) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); - *err = -EIO; - goto out_buf; - } -#endif - if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); - goto out_buf; - } - dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk); - freedqbuf(buf); - return blk; -out_buf: - freedqbuf(buf); - return 0; -} - -/* Insert reference to structure into the trie */ -static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth) -{ - struct super_block *sb = dquot->dq_sb; - dqbuf_t buf; - int ret = 0, newson = 0, newact = 0; - __le32 *ref; - uint newblk; - - if (!(buf = getdqbuf())) - return -ENOMEM; - if (!*treeblk) { - ret = get_free_dqblk(sb, dquot->dq_type); - if (ret < 0) - goto out_buf; - *treeblk = ret; - memset(buf, 0, V2_DQBLKSIZE); - newact = 1; - } - else { - if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); - goto out_buf; - } - } - ref = (__le32 *)buf; - newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (!newblk) - newson = 1; - if (depth == V2_DQTREEDEPTH-1) { -#ifdef __QUOTA_V2_PARANOIA - if (newblk) { - printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)])); - ret = -EIO; - goto out_buf; - } -#endif - newblk = find_free_dqentry(dquot, &ret); - } - else - ret = do_insert_tree(dquot, &newblk, depth+1); - if (newson && ret >= 0) { - ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); - ret = write_blk(sb, dquot->dq_type, *treeblk, buf); - } - else if (newact && ret < 0) - put_free_dqblk(sb, dquot->dq_type, buf, *treeblk); -out_buf: - freedqbuf(buf); - return ret; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; } -/* Wrapper for inserting quota structure into tree */ -static inline int dq_insert_tree(struct dquot *dquot) +static int v2_read_dquot(struct dquot *dquot) { - int tmp = V2_DQTREEOFF; - return do_insert_tree(dquot, &tmp, 0); + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* - * We don't have to be afraid of deadlocks as we never have quotas on quota files... - */ static int v2_write_dquot(struct dquot *dquot) { - int type = dquot->dq_type; - ssize_t ret; - struct v2_disk_dqblk ddquot, empty; - - /* dq_off is guarded by dqio_mutex */ - if (!dquot->dq_off) - if ((ret = dq_insert_tree(dquot)) < 0) { - printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret); - return ret; - } - spin_lock(&dq_data_lock); - mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); - /* Argh... We may need to write structure full of zeroes but that would be - * treated as an empty place by the rest of the code. Format change would - * be definitely cleaner but the problems probably are not worth it */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); - if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) - ddquot.dqb_itime = cpu_to_le64(1); - spin_unlock(&dq_data_lock); - ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, - (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); - if (ret != sizeof(struct v2_disk_dqblk)) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); - if (ret >= 0) - ret = -ENOSPC; - } - else - ret = 0; - dqstats.writes++; - - return ret; + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* Free dquot entry in data block */ -static int free_dqentry(struct dquot *dquot, uint blk) -{ - struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; - struct v2_disk_dqdbheader *dh; - dqbuf_t buf = getdqbuf(); - int ret = 0; - - if (!buf) - return -ENOMEM; - if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { - printk(KERN_ERR "VFS: Quota structure has offset to other " - "block (%u) than it should (%u).\n", blk, - (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); - goto out_buf; - } - if ((ret = read_blk(sb, type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); - goto out_buf; - } - dh = (struct v2_disk_dqdbheader *)buf; - le16_add_cpu(&dh->dqdh_entries, -1); - if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ - if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || - (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: Can't move quota data block (%u) " - "to free list.\n", blk); - goto out_buf; - } - } - else { - memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, - sizeof(struct v2_disk_dqblk)); - if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { - /* Insert will write block itself */ - if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); - goto out_buf; - } - } - else - if ((ret = write_blk(sb, type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't write quota data " - "block %u\n", blk); - goto out_buf; - } - } - dquot->dq_off = 0; /* Quota is now unattached */ -out_buf: - freedqbuf(buf); - return ret; -} - -/* Remove reference to dquot from tree */ -static int remove_tree(struct dquot *dquot, uint *blk, int depth) -{ - struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; - dqbuf_t buf = getdqbuf(); - int ret = 0; - uint newblk; - __le32 *ref = (__le32 *)buf; - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(sb, type, *blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); - goto out_buf; - } - newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (depth == V2_DQTREEDEPTH-1) { - ret = free_dqentry(dquot, newblk); - newblk = 0; - } - else - ret = remove_tree(dquot, &newblk, depth+1); - if (ret >= 0 && !newblk) { - int i; - ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0); - for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */ - /* Don't put the root block into the free block list */ - if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) { - put_free_dqblk(sb, type, buf, *blk); - *blk = 0; - } - else - if ((ret = write_blk(sb, type, *blk, buf)) < 0) - printk(KERN_ERR "VFS: Can't write quota tree " - "block %u.\n", *blk); - } -out_buf: - freedqbuf(buf); - return ret; -} - -/* Delete dquot from tree */ -static int v2_delete_dquot(struct dquot *dquot) -{ - uint tmp = V2_DQTREEOFF; - - if (!dquot->dq_off) /* Even not allocated? */ - return 0; - return remove_tree(dquot, &tmp, 0); -} - -/* Find entry in block */ -static loff_t find_block_dqentry(struct dquot *dquot, uint blk) -{ - dqbuf_t buf = getdqbuf(); - loff_t ret = 0; - int i; - struct v2_disk_dqblk *ddquot = GETENTRIES(buf); - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); - goto out_buf; - } - if (dquot->dq_id) - for (i = 0; i < V2_DQSTRINBLK && - le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); - else { /* ID 0 as a bit more complicated searching... */ - struct v2_disk_dqblk fakedquot; - - memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); - for (i = 0; i < V2_DQSTRINBLK; i++) - if (!le32_to_cpu(ddquot[i].dqb_id) && - memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) - break; - } - if (i == V2_DQSTRINBLK) { - printk(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); - ret = -EIO; - goto out_buf; - } - else - ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct - v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); -out_buf: - freedqbuf(buf); - return ret; -} - -/* Find entry for given id in the tree */ -static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) -{ - dqbuf_t buf = getdqbuf(); - loff_t ret = 0; - __le32 *ref = (__le32 *)buf; - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); - goto out_buf; - } - ret = 0; - blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (!blk) /* No reference? */ - goto out_buf; - if (depth < V2_DQTREEDEPTH-1) - ret = find_tree_dqentry(dquot, blk, depth+1); - else - ret = find_block_dqentry(dquot, blk); -out_buf: - freedqbuf(buf); - return ret; -} - -/* Find entry for given id in the tree - wrapper function */ -static inline loff_t find_dqentry(struct dquot *dquot) -{ - return find_tree_dqentry(dquot, V2_DQTREEOFF, 0); -} - -static int v2_read_dquot(struct dquot *dquot) +static int v2_release_dquot(struct dquot *dquot) { - int type = dquot->dq_type; - loff_t offset; - struct v2_disk_dqblk ddquot, empty; - int ret = 0; - -#ifdef __QUOTA_V2_PARANOIA - /* Invalidated quota? */ - if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) { - printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); - return -EIO; - } -#endif - offset = find_dqentry(dquot); - if (offset <= 0) { /* Entry not present? */ - if (offset < 0) - printk(KERN_ERR "VFS: Can't read quota " - "structure for id %u.\n", dquot->dq_id); - dquot->dq_off = 0; - set_bit(DQ_FAKE_B, &dquot->dq_flags); - memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); - ret = offset; - } - else { - dquot->dq_off = offset; - if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, - (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) - != sizeof(struct v2_disk_dqblk)) { - if (ret >= 0) - ret = -EIO; - printk(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); - memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); - } - else { - ret = 0; - /* We need to escape back all-zero structure */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); - empty.dqb_itime = cpu_to_le64(1); - if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) - ddquot.dqb_itime = 0; - } - disk2memdqb(&dquot->dq_dqb, &ddquot); - if (!dquot->dq_dqb.dqb_bhardlimit && - !dquot->dq_dqb.dqb_bsoftlimit && - !dquot->dq_dqb.dqb_ihardlimit && - !dquot->dq_dqb.dqb_isoftlimit) - set_bit(DQ_FAKE_B, &dquot->dq_flags); - } - dqstats.reads++; - - return ret; + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* Check whether dquot should not be deleted. We know we are - * the only one operating on dquot (thanks to dq_lock) */ -static int v2_release_dquot(struct dquot *dquot) +static int v2_free_file_info(struct super_block *sb, int type) { - if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) - return v2_delete_dquot(dquot); + kfree(sb_dqinfo(sb, type)->dqi_priv); return 0; } @@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = { .check_quota_file = v2_check_quota_file, .read_file_info = v2_read_file_info, .write_file_info = v2_write_file_info, - .free_file_info = NULL, + .free_file_info = v2_free_file_info, .read_dqblk = v2_read_dquot, .commit_dqblk = v2_write_dquot, .release_dqblk = v2_release_dquot, diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h new file mode 100644 index 000000000000..746654b5de70 --- /dev/null +++ b/fs/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include <linux/types.h> + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. + */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h new file mode 100644 index 000000000000..530fe580685c --- /dev/null +++ b/fs/quotaio_v2.h @@ -0,0 +1,60 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include <linux/types.h> +#include <linux/quota.h> + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 0, /* USRQUOTA */\ + 0 /* GRPQUOTA */\ +} + +/* First generic header */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +struct v2_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ + __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __le32 dqi_blocks; /* Number of blocks in file */ + __le32 dqi_free_blk; /* Number of first free block in the list */ + __le32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ + +#endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 76acdbc34611..b9b567a28376 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, ret = -ENOMEM; pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); if (!pages) - goto out; + goto out_free; nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) - goto out; /* leave if some pages were missing */ + goto out_free_pages; /* leave if some pages were missing */ /* check the pages for physical adjacency */ ptr = pages; @@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, page++; for (loop = lpages; loop > 1; loop--) if (*ptr++ != page++) - goto out; + goto out_free_pages; /* okay - all conditions fulfilled */ ret = (unsigned long) page_address(pages[0]); - out: - if (pages) { - ptr = pages; - for (loop = lpages; loop > 0; loop--) - put_page(*ptr++); - kfree(pages); - } - +out_free_pages: + ptr = pages; + for (loop = nr; loop > 0; loop--) + put_page(*ptr++); +out_free: + kfree(pages); +out: return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a83a3518ae33..b7e6ac706b87 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); diff --git a/fs/read_write.c b/fs/read_write.c index 969a6d9c020b..5cc6924eb158 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) offset += inode->i_size; break; case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; offset += file->f_pos; break; } @@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += i_size_read(file->f_path.dentry->d_inode); break; case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto out; + } offset += file->f_pos; } retval = -EINVAL; @@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } +out: unlock_kernel(); return retval; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6c4c2c69449f..55fce92cdf18 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *inode) { struct super_block *sb; + struct reiserfs_iget_args args; INITIALIZE_PATH(path_to_key); struct cpu_key key; struct item_head ih; @@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, err = -ENOMEM; goto out_bad_inode; } + args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); + if (insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args) < 0) { + err = -EINVAL; + goto out_bad_inode; + } if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. @@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); - if (old_format_only(sb)) - make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, - TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); - else - make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, - TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - /* key to search for correct place for new stat data */ _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, @@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } else { inode2sd(&sd, inode, inode->i_size); } - // these do not go to on-disk stat data - inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); - // store in in-core inode the key of stat data and version all // object items will have (directory items will have old offset // format, other new objects will consist of new items) - memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) set_inode_item_key_version(inode, KEY_FORMAT_3_5); else @@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_mark_inode_private(inode); } - insert_inode_hash(inode); reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key); @@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, out_inserted_sd: inode->i_nlink = 0; th->t_trans_id = 0; /* so the caller can't use this handle later */ + unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ /* If we were inheriting an ACL, we need to release the lock so that * iput doesn't deadlock in reiserfs_delete_xattrs. The locking @@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file, } index = pos >> PAGE_CACHE_SHIFT; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 4f322e5ed840..738967f6c8ee 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } @@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, reiserfs_update_inode_transaction(dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: @@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = journal_end(&th, dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } @@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) reiserfs_update_sd(&th, dir); d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, dir->i_sb, jbegin_count); out_failed: if (locked) @@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir, err = journal_end(&th, parent_dir->i_sb, jbegin_count); if (err) retval = err; + unlock_new_inode(inode); iput(inode); goto out_failed; } d_instantiate(dentry, inode); + unlock_new_inode(inode); retval = journal_end(&th, parent_dir->i_sb, jbegin_count); out_failed: reiserfs_write_unlock(parent_dir->i_sb); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 663a91f5dce8..f3c820b75829 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s) reiserfs_sync_fs(s, 1); } -static void reiserfs_write_super_lockfs(struct super_block *s) +static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; reiserfs_write_lock(s); @@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s) } s->s_dirt = 0; reiserfs_write_unlock(s); + return 0; } -static void reiserfs_unlockfs(struct super_block *s) +static int reiserfs_unfreeze(struct super_block *s) { reiserfs_allow_writes(s); + return 0; } extern const struct in_core_key MAX_IN_CORE_KEY; @@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = { .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, - .write_super_lockfs = reiserfs_write_super_lockfs, - .unlockfs = reiserfs_unlockfs, + .freeze_fs = reiserfs_freeze, + .unfreeze_fs = reiserfs_unfreeze, .statfs = reiserfs_statfs, .remount_fs = reiserfs_remount, .show_options = generic_show_options, @@ -649,6 +651,8 @@ static struct dquot_operations reiserfs_quota_operations = { .release_dquot = reiserfs_release_dquot, .mark_dirty = reiserfs_mark_dquot_dirty, .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops reiserfs_qctl_operations = { @@ -994,8 +998,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'u' || c == 'g') { int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - if ((sb_any_quota_enabled(s) || - sb_any_quota_suspended(s)) && + if (sb_any_quota_loaded(s) && (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { reiserfs_warning(s, "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); @@ -1041,8 +1044,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "reiserfs_parse_options: unknown quota format specified."); return 0; } - if ((sb_any_quota_enabled(s) || - sb_any_quota_suspended(s)) && + if (sb_any_quota_loaded(s) && *qfmt != REISERFS_SB(s)->s_jquota_fmt) { reiserfs_warning(s, "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); @@ -1067,7 +1069,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ if (!(*mount_options & (1 << REISERFS_QUOTA)) - && sb_any_quota_enabled(s)) { + && sb_any_quota_loaded(s)) { reiserfs_warning(s, "reiserfs_parse_options: quota options must be present when quota is turned on."); return 0; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 60d2f822e87b..98a232f7196b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -490,7 +490,7 @@ static mode_t romfs_modemap[] = static struct inode * romfs_iget(struct super_block *sb, unsigned long ino) { - int nextfh; + int nextfh, ret; struct romfs_inode ri; struct inode *i; @@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino) i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - i->i_uid = i->i_gid = 0; /* Precalculate the data offset */ - ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); - if (ino >= 0) - ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); - else - ino = 0; + ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); + if (ret >= 0) + ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; + else + ino = 0; ROMFS_I(i)->i_metasize = ino; ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); diff --git a/fs/select.c b/fs/select.c index 87df51eadcf2..08b91beed806 100644 --- a/fs/select.c +++ b/fs/select.c @@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } - EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) @@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) free_page((unsigned long) old); } } - EXPORT_SYMBOL(poll_freewait); -static struct poll_table_entry *poll_get_entry(poll_table *_p) +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { - struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) @@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); return NULL; } new_table->entry = new_table->entries; @@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) return table->entry++; } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - struct poll_table_entry *entry = poll_get_entry(p); + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec variable for the final timeout @@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - set_current_state(TASK_INTERRUPTIBLE); - inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); poll_freewait(&table); @@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; - set_current_state(TASK_INTERRUPTIBLE); for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); return count; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 16c211558c22..b569ff1c4dc8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc) } EXPORT_SYMBOL(mangle_path); -/* - * return the absolute path of 'dentry' residing in mount 'mnt'. +/** + * seq_path - seq_file interface to print a pathname + * @m: the seq_file handle + * @path: the struct path to print + * @esc: set of characters to escape in the output + * + * return the absolute path of 'path', as represented by the + * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, struct path *path, char *esc) { @@ -462,7 +468,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) return -1; } -int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) +int seq_bitmap(struct seq_file *m, const unsigned long *bits, + unsigned int nr_bits) { if (m->count < m->size) { int len = bitmap_scnprintf(m->buf + m->count, diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index e4f8d51a5553..92d5e8ffb639 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; diff --git a/fs/splice.c b/fs/splice.c index 1abab5cee4ba..a54b3e3f10a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -21,6 +21,7 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> +#include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile new file mode 100644 index 000000000000..8258cf9a0317 --- /dev/null +++ b/fs/squashfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the linux squashfs routines. +# + +obj-$(CONFIG_SQUASHFS) += squashfs.o +squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o +squashfs-y += namei.o super.o symlink.o +#squashfs-y += squashfs2_0.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c new file mode 100644 index 000000000000..c837dfc2b3c6 --- /dev/null +++ b/fs/squashfs/block.c @@ -0,0 +1,274 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * block.c + */ + +/* + * This file implements the low-level routines to read and decompress + * datablocks and metadata blocks. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head *bh; + + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + } + + return bh; +} + + +/* + * Read and decompress a metadata block or datablock. Length is non-zero + * if a datablock is being read (the size is stored elsewhere in the + * filesystem), otherwise the length is obtained from the first two bytes of + * the metadata block. A bit in the length field indicates if the block + * is stored uncompressed in the filesystem (usually because compression + * generated a larger block - this does occasionally happen with zlib). + */ +int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, + int length, u64 *next_index, int srclength) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, page = 0, avail; + + + bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, + sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, srclength); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; + + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? "" : "un", length); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto block_release; + + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); + } + + if (compressed) { + int zlib_err = 0, zlib_init = 0; + + /* + * Uncompress block. + */ + + mutex_lock(&msblk->read_data_mutex); + + msblk->stream.avail_out = 0; + msblk->stream.avail_in = 0; + + bytes = length; + do { + if (msblk->stream.avail_in == 0 && k < b) { + avail = min(bytes, msblk->devblksize - offset); + bytes -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + msblk->stream.next_in = bh[k]->b_data + offset; + msblk->stream.avail_in = avail; + offset = 0; + } + + if (msblk->stream.avail_out == 0) { + msblk->stream.next_out = buffer[page++]; + msblk->stream.avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned" + " unexpected result 0x%x," + " srclength %d\n", zlib_err, + srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); + + if (msblk->stream.avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate returned unexpected result" + " 0x%x, srclength %d, avail_in %d," + " avail_out %d\n", zlib_err, srclength, + msblk->stream.avail_in, + msblk->stream.avail_out); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateEnd returned unexpected result 0x%x," + " srclength %d\n", zlib_err, srclength); + goto release_mutex; + } + length = msblk->stream.total_out; + mutex_unlock(&msblk->read_data_mutex); + } else { + /* + * Block is uncompressed. + */ + int i, in, pg_offset = 0; + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + page++; + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(buffer[page] + pg_offset, + bh[k]->b_data + offset, avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + } + + kfree(bh); + return length; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + +block_release: + for (; k < b; k++) + put_bh(bh[k]); + +read_failure: + ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + kfree(bh); + return -EIO; +} diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c new file mode 100644 index 000000000000..f29eda16d25e --- /dev/null +++ b/fs/squashfs/cache.c @@ -0,0 +1,412 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * cache.c + */ + +/* + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing + * recently accessed data Squashfs uses two small metadata and fragment caches. + * + * This file implements a generic cache implementation used for both caches, + * plus functions layered ontop of the generic cache implementation to + * access the metadata and fragment caches. + * + * To avoid out of memory and fragmentation isssues with vmalloc the cache + * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * + * It should be noted that the cache is not used for file datablocks, these + * are decompressed and cached in the page-cache in the normal way. The + * cache is only used to temporarily cache fragment and metadata blocks + * which have been read as as a result of a metadata (i.e. inode or + * directory) or fragment access. Because metadata and fragments are packed + * together into blocks (to gain greater compression) the read of a particular + * piece of metadata or fragment will retrieve other metadata/fragments which + * have been packed with it, these because of locality-of-reference may be read + * in the near future. Temporarily caching them ensures they are available for + * near future access without requiring an additional read and decompress. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/zlib.h> +#include <linux/pagemap.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up block in cache, and increment usage count. If not in cache, read + * and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, + struct squashfs_cache *cache, u64 block, int length) +{ + int i, n; + struct squashfs_cache_entry *entry; + + spin_lock(&cache->lock); + + while (1) { + for (i = 0; i < cache->entries; i++) + if (cache->entry[i].block == block) + break; + + if (i == cache->entries) { + /* + * Block not in cache, if all cache entries are used + * go to sleep waiting for one to become available. + */ + if (cache->unused == 0) { + cache->num_waiters++; + spin_unlock(&cache->lock); + wait_event(cache->wait_queue, cache->unused); + spin_lock(&cache->lock); + cache->num_waiters--; + continue; + } + + /* + * At least one unused cache entry. A simple + * round-robin strategy is used to choose the entry to + * be evicted from the cache. + */ + i = cache->next_blk; + for (n = 0; n < cache->entries; n++) { + if (cache->entry[i].refcount == 0) + break; + i = (i + 1) % cache->entries; + } + + cache->next_blk = (i + 1) % cache->entries; + entry = &cache->entry[i]; + + /* + * Initialise choosen cache entry, and fill it in from + * disk. + */ + cache->unused--; + entry->block = block; + entry->refcount = 1; + entry->pending = 1; + entry->num_waiters = 0; + entry->error = 0; + spin_unlock(&cache->lock); + + entry->length = squashfs_read_data(sb, entry->data, + block, length, &entry->next_index, + cache->block_size); + + spin_lock(&cache->lock); + + if (entry->length < 0) + entry->error = entry->length; + + entry->pending = 0; + + /* + * While filling this entry one or more other processes + * have looked it up in the cache, and have slept + * waiting for it to become available. + */ + if (entry->num_waiters) { + spin_unlock(&cache->lock); + wake_up_all(&entry->wait_queue); + } else + spin_unlock(&cache->lock); + + goto out; + } + + /* + * Block already in cache. Increment refcount so it doesn't + * get reused until we're finished with it, if it was + * previously unused there's one less cache entry available + * for reuse. + */ + entry = &cache->entry[i]; + if (entry->refcount == 0) + cache->unused--; + entry->refcount++; + + /* + * If the entry is currently being filled in by another process + * go to sleep waiting for it to become available. + */ + if (entry->pending) { + entry->num_waiters++; + spin_unlock(&cache->lock); + wait_event(entry->wait_queue, !entry->pending); + } else + spin_unlock(&cache->lock); + + goto out; + } + +out: + TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", + cache->name, i, entry->block, entry->refcount, entry->error); + + if (entry->error) + ERROR("Unable to read %s cache entry [%llx]\n", cache->name, + block); + return entry; +} + + +/* + * Release cache entry, once usage count is zero it can be reused. + */ +void squashfs_cache_put(struct squashfs_cache_entry *entry) +{ + struct squashfs_cache *cache = entry->cache; + + spin_lock(&cache->lock); + entry->refcount--; + if (entry->refcount == 0) { + cache->unused++; + /* + * If there's any processes waiting for a block to become + * available, wake one up. + */ + if (cache->num_waiters) { + spin_unlock(&cache->lock); + wake_up(&cache->wait_queue); + return; + } + } + spin_unlock(&cache->lock); +} + +/* + * Delete cache reclaiming all kmalloced buffers. + */ +void squashfs_cache_delete(struct squashfs_cache *cache) +{ + int i, j; + + if (cache == NULL) + return; + + for (i = 0; i < cache->entries; i++) { + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } + } + + kfree(cache->entry); + kfree(cache); +} + + +/* + * Initialise cache allocating the specified number of entries, each of + * size block_size. To avoid vmalloc fragmentation issues each entry + * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + */ +struct squashfs_cache *squashfs_cache_init(char *name, int entries, + int block_size) +{ + int i, j; + struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + + if (cache == NULL) { + ERROR("Failed to allocate %s cache\n", name); + return NULL; + } + + cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); + if (cache->entry == NULL) { + ERROR("Failed to allocate %s cache\n", name); + goto cleanup; + } + + cache->next_blk = 0; + cache->unused = entries; + cache->entries = entries; + cache->block_size = block_size; + cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->name = name; + cache->num_waiters = 0; + spin_lock_init(&cache->lock); + init_waitqueue_head(&cache->wait_queue); + + for (i = 0; i < entries; i++) { + struct squashfs_cache_entry *entry = &cache->entry[i]; + + init_waitqueue_head(&cache->entry[i].wait_queue); + entry->cache = cache; + entry->block = SQUASHFS_INVALID_BLK; + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + + for (j = 0; j < cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + } + + return cache; + +cleanup: + squashfs_cache_delete(cache); + return NULL; +} + + +/* + * Copy upto length bytes from cache entry to buffer starting at offset bytes + * into the cache entry. If there's not length bytes then copy the number of + * bytes available. In all cases return the number of bytes copied. + */ +int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, + int offset, int length) +{ + int remaining = length; + + if (length == 0) + return 0; + else if (buffer == NULL) + return min(length, entry->length - offset); + + while (offset < entry->length) { + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); + int bytes = min_t(int, entry->length - offset, + PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + + if (bytes >= remaining) { + memcpy(buffer, buff, remaining); + remaining = 0; + break; + } + + memcpy(buffer, buff, bytes); + buffer += bytes; + remaining -= bytes; + offset += bytes; + } + + return length - remaining; +} + + +/* + * Read length bytes from metadata position <block, offset> (block is the + * start of the compressed block on disk, and offset is the offset into + * the block once decompressed). Data is packed into consecutive blocks, + * and length bytes may require reading more than one block. + */ +int squashfs_read_metadata(struct super_block *sb, void *buffer, + u64 *block, int *offset, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int bytes, copied = length; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + + while (length) { + entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); + if (entry->error) + return entry->error; + else if (*offset >= entry->length) + return -EIO; + + bytes = squashfs_copy_data(buffer, entry, *offset, length); + if (buffer) + buffer += bytes; + length -= bytes; + *offset += bytes; + + if (*offset == entry->length) { + *block = entry->next_index; + *offset = 0; + } + + squashfs_cache_put(entry); + } + + return copied; +} + + +/* + * Look-up in the fragmment cache the fragment located at <start_block> in the + * filesystem. If necessary read and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->fragment_cache, start_block, + length); +} + + +/* + * Read and decompress the datablock located at <start_block> in the + * filesystem. The cache is used here to avoid duplicating locking and + * read/decompress code. + */ +struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->read_page, start_block, length); +} + + +/* + * Read a filesystem table (uncompressed sequence of bytes) from disk + */ +int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, + int length) +{ + int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int i, res; + void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + res = squashfs_read_data(sb, data, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + kfree(data); + return res; +} diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c new file mode 100644 index 000000000000..566b0eaed868 --- /dev/null +++ b/fs/squashfs/dir.c @@ -0,0 +1,235 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * dir.c + */ + +/* + * This file implements code to read directories from disk. + * + * See namei.c for a description of directory organisation on disk. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const unsigned char squashfs_filetype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK +}; + +/* + * Lookup offset (f_pos) in the directory index, returning the + * metadata block containing it. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_offset(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, int index_offset, + int i_count, u64 f_pos) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int err, i, index, length = 0; + struct squashfs_dir_index dir_index; + + TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", + i_count, f_pos); + + /* + * Translate from external f_pos to the internal f_pos. This + * is offset by 3 because we invent "." and ".." entries which are + * not actually stored in the directory. + */ + if (f_pos < 3) + return f_pos; + f_pos -= 3; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, &dir_index, &index_start, + &index_offset, sizeof(dir_index)); + if (err < 0) + break; + + index = le32_to_cpu(dir_index.index); + if (index > f_pos) + /* + * Found the index we're looking for. + */ + break; + + err = squashfs_read_metadata(sb, NULL, &index_start, + &index_offset, le32_to_cpu(dir_index.size) + 1); + if (err < 0) + break; + + length = index; + *next_block = le32_to_cpu(dir_index.start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + + /* + * Translate back from internal f_pos to external f_pos. + */ + return length + 3; +} + + +static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + u64 block = squashfs_i(inode)->start + msblk->directory_table; + int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, + type, err; + unsigned int inode_number; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + + TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + goto finish; + } + + /* + * Return "." and ".." entries as the first two filenames in the + * directory. To maximise compression these two entries are not + * stored in the directory, and so we invent them here. + * + * It also means that the external f_pos is offset by 3 from the + * on-disk directory f_pos. + */ + while (file->f_pos < 3) { + char *name; + int i_ino; + + if (file->f_pos == 0) { + name = "."; + size = 1; + i_ino = inode->i_ino; + } else { + name = ".."; + size = 2; + i_ino = squashfs_i(inode)->parent; + } + + TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", + dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]); + + if (filldir(dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos += size; + } + + length = get_dir_index_using_offset(inode->i_sb, &block, &offset, + squashfs_i(inode)->dir_idx_start, + squashfs_i(inode)->dir_idx_offset, + squashfs_i(inode)->dir_idx_cnt, + file->f_pos); + + while (length < i_size_read(inode)) { + /* + * Read directory header + */ + err = squashfs_read_metadata(inode->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto failed_read; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(inode->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto failed_read; + + size = le16_to_cpu(dire->size) + 1; + + err = squashfs_read_metadata(inode->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto failed_read; + + length += sizeof(*dire) + size; + + if (file->f_pos >= length) + continue; + + dire->name[size] = '\0'; + inode_number = le32_to_cpu(dirh.inode_number) + + ((short) le16_to_cpu(dire->inode_number)); + type = le16_to_cpu(dire->type); + + TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" + "\n", dirent, dire->name, size, + file->f_pos, + le32_to_cpu(dirh.start_block), + le16_to_cpu(dire->offset), + inode_number, + squashfs_filetype_table[type]); + + if (filldir(dirent, dire->name, size, file->f_pos, + inode_number, + squashfs_filetype_table[type]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos = length; + } + } + +finish: + kfree(dire); + return 0; + +failed_read: + ERROR("Unable to read directory block [%llx:%x]\n", block, offset); + kfree(dire); + return 0; +} + + +const struct file_operations squashfs_dir_ops = { + .read = generic_read_dir, + .readdir = squashfs_readdir +}; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c new file mode 100644 index 000000000000..69e971d5ddc1 --- /dev/null +++ b/fs/squashfs/export.c @@ -0,0 +1,155 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * export.c + */ + +/* + * This file implements code to make Squashfs filesystems exportable (NFS etc.) + * + * The export code uses an inode lookup table to map inode numbers passed in + * filehandles to an inode location on disk. This table is stored compressed + * into metadata blocks. A second index table is used to locate these. This + * second index table for speed of access (and because it is small) is read at + * mount time and cached in memory. + * + * The inode lookup table is used only by the export code, inode disk + * locations are directly encoded in directories, enabling direct access + * without an intermediate lookup for all operations except the export ops. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/dcache.h> +#include <linux/exportfs.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up inode number (ino) in table, returning the inode location. + */ +static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); + int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); + u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + __le64 ino; + int err; + + TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); + if (err < 0) + return err; + + TRACE("squashfs_inode_lookup, inode = 0x%llx\n", + (u64) le64_to_cpu(ino)); + + return le64_to_cpu(ino); +} + + +static struct dentry *squashfs_export_iget(struct super_block *sb, + unsigned int ino_num) +{ + long long ino; + struct dentry *dentry = ERR_PTR(-ENOENT); + + TRACE("Entered squashfs_export_iget\n"); + + ino = squashfs_inode_lookup(sb, ino_num); + if (ino >= 0) + dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); + + return dentry; +} + + +static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) + || fh_len < 2) + return NULL; + + return squashfs_export_iget(sb, fid->i32.ino); +} + + +static struct dentry *squashfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) + return NULL; + + return squashfs_export_iget(sb, fid->i32.parent_ino); +} + + +static struct dentry *squashfs_get_parent(struct dentry *child) +{ + struct inode *inode = child->d_inode; + unsigned int parent_ino = squashfs_i(inode)->parent; + + return squashfs_export_iget(inode->i_sb, parent_ino); +} + + +/* + * Read uncompressed inode lookup table indexes off disk into memory + */ +__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, + u64 lookup_table_start, unsigned int inodes) +{ + unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + __le64 *inode_lookup_table; + int err; + + TRACE("In read_inode_lookup_table, length %d\n", length); + + /* Allocate inode lookup table indexes */ + inode_lookup_table = kmalloc(length, GFP_KERNEL); + if (inode_lookup_table == NULL) { + ERROR("Failed to allocate inode lookup table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start, + length); + if (err < 0) { + ERROR("unable to read inode lookup table\n"); + kfree(inode_lookup_table); + return ERR_PTR(err); + } + + return inode_lookup_table; +} + + +const struct export_operations squashfs_export_ops = { + .fh_to_dentry = squashfs_fh_to_dentry, + .fh_to_parent = squashfs_fh_to_parent, + .get_parent = squashfs_get_parent +}; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c new file mode 100644 index 000000000000..717767d831df --- /dev/null +++ b/fs/squashfs/file.c @@ -0,0 +1,502 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * file.c + */ + +/* + * This file contains code for handling regular files. A regular file + * consists of a sequence of contiguous compressed blocks, and/or a + * compressed fragment block (tail-end packed block). The compressed size + * of each datablock is stored in a block list contained within the + * file inode (itself stored in one or more compressed metadata blocks). + * + * To speed up access to datablocks when reading 'large' files (256 Mbytes or + * larger), the code implements an index cache that caches the mapping from + * block index to datablock location on disk. + * + * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while + * retaining a simple and space-efficient block list on disk. The cache + * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). + * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. + * The index cache is designed to be memory efficient, and by default uses + * 16 KiB. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Locate cache slot in range [offset, index] for specified inode. If + * there's more than one return the slot closest to index. + */ +static struct meta_index *locate_meta_index(struct inode *inode, int offset, + int index) +{ + struct meta_index *meta = NULL; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("locate_meta_index: index %d, offset %d\n", index, offset); + + if (msblk->meta_index == NULL) + goto not_allocated; + + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + if (msblk->meta_index[i].inode_number == inode->i_ino && + msblk->meta_index[i].offset >= offset && + msblk->meta_index[i].offset <= index && + msblk->meta_index[i].locked == 0) { + TRACE("locate_meta_index: entry %d, offset %d\n", i, + msblk->meta_index[i].offset); + meta = &msblk->meta_index[i]; + offset = meta->offset; + } + } + + if (meta) + meta->locked = 1; + +not_allocated: + mutex_unlock(&msblk->meta_index_mutex); + + return meta; +} + + +/* + * Find and initialise an empty cache slot for index offset. + */ +static struct meta_index *empty_meta_index(struct inode *inode, int offset, + int skip) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct meta_index *meta = NULL; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); + + if (msblk->meta_index == NULL) { + /* + * First time cache index has been used, allocate and + * initialise. The cache index could be allocated at + * mount time but doing it here means it is allocated only + * if a 'large' file is read. + */ + msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, + sizeof(*(msblk->meta_index)), GFP_KERNEL); + if (msblk->meta_index == NULL) { + ERROR("Failed to allocate meta_index\n"); + goto failed; + } + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + msblk->meta_index[i].inode_number = 0; + msblk->meta_index[i].locked = 0; + } + msblk->next_meta_index = 0; + } + + for (i = SQUASHFS_META_SLOTS; i && + msblk->meta_index[msblk->next_meta_index].locked; i--) + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + if (i == 0) { + TRACE("empty_meta_index: failed!\n"); + goto failed; + } + + TRACE("empty_meta_index: returned meta entry %d, %p\n", + msblk->next_meta_index, + &msblk->meta_index[msblk->next_meta_index]); + + meta = &msblk->meta_index[msblk->next_meta_index]; + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + meta->inode_number = inode->i_ino; + meta->offset = offset; + meta->skip = skip; + meta->entries = 0; + meta->locked = 1; + +failed: + mutex_unlock(&msblk->meta_index_mutex); + return meta; +} + + +static void release_meta_index(struct inode *inode, struct meta_index *meta) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + mutex_lock(&msblk->meta_index_mutex); + meta->locked = 0; + mutex_unlock(&msblk->meta_index_mutex); +} + + +/* + * Read the next n blocks from the block list, starting from + * metadata block <start_block, offset>. + */ +static long long read_indexes(struct super_block *sb, int n, + u64 *start_block, int *offset) +{ + int err, i; + long long block = 0; + __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + + if (blist == NULL) { + ERROR("read_indexes: Failed to allocate block_list\n"); + return -ENOMEM; + } + + while (n) { + int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + + err = squashfs_read_metadata(sb, blist, start_block, + offset, blocks << 2); + if (err < 0) { + ERROR("read_indexes: reading block [%llx:%x]\n", + *start_block, *offset); + goto failure; + } + + for (i = 0; i < blocks; i++) { + int size = le32_to_cpu(blist[i]); + block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); + } + n -= blocks; + } + + kfree(blist); + return block; + +failure: + kfree(blist); + return err; +} + + +/* + * Each cache index slot has SQUASHFS_META_ENTRIES, each of which + * can cache one index -> datablock/blocklist-block mapping. We wish + * to distribute these over the length of the file, entry[0] maps index x, + * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. + * The larger the file, the greater the skip factor. The skip factor is + * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure + * the number of metadata blocks that need to be read fits into the cache. + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +static inline int calculate_skip(int blocks) +{ + int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); + return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); +} + + +/* + * Search and grow the index cache for the specified inode, returning the + * on-disk locations of the datablock and block list metadata block + * <index_block, index_offset> for index (scaled to nearest cache index). + */ +static int fill_meta_index(struct inode *inode, int index, + u64 *index_block, int *index_offset, u64 *data_block) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); + int offset = 0; + struct meta_index *meta; + struct meta_entry *meta_entry; + u64 cur_index_block = squashfs_i(inode)->block_list_start; + int cur_offset = squashfs_i(inode)->offset; + u64 cur_data_block = squashfs_i(inode)->start; + int err, i; + + /* + * Scale index to cache index (cache slot entry) + */ + index /= SQUASHFS_META_INDEXES * skip; + + while (offset < index) { + meta = locate_meta_index(inode, offset + 1, index); + + if (meta == NULL) { + meta = empty_meta_index(inode, offset + 1, skip); + if (meta == NULL) + goto all_done; + } else { + offset = index < meta->offset + meta->entries ? index : + meta->offset + meta->entries - 1; + meta_entry = &meta->meta_entry[offset - meta->offset]; + cur_index_block = meta_entry->index_block + + msblk->inode_table; + cur_offset = meta_entry->offset; + cur_data_block = meta_entry->data_block; + TRACE("get_meta_index: offset %d, meta->offset %d, " + "meta->entries %d\n", offset, meta->offset, + meta->entries); + TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" + " data_block 0x%llx\n", cur_index_block, + cur_offset, cur_data_block); + } + + /* + * If necessary grow cache slot by reading block list. Cache + * slot is extended up to index or to the end of the slot, in + * which case further slots will be used. + */ + for (i = meta->offset + meta->entries; i <= index && + i < meta->offset + SQUASHFS_META_ENTRIES; i++) { + int blocks = skip * SQUASHFS_META_INDEXES; + long long res = read_indexes(inode->i_sb, blocks, + &cur_index_block, &cur_offset); + + if (res < 0) { + if (meta->entries == 0) + /* + * Don't leave an empty slot on read + * error allocated to this inode... + */ + meta->inode_number = 0; + err = res; + goto failed; + } + + cur_data_block += res; + meta_entry = &meta->meta_entry[i - meta->offset]; + meta_entry->index_block = cur_index_block - + msblk->inode_table; + meta_entry->offset = cur_offset; + meta_entry->data_block = cur_data_block; + meta->entries++; + offset++; + } + + TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", + meta->offset, meta->entries); + + release_meta_index(inode, meta); + } + +all_done: + *index_block = cur_index_block; + *index_offset = cur_offset; + *data_block = cur_data_block; + + /* + * Scale cache index (cache slot entry) to index + */ + return offset * SQUASHFS_META_INDEXES * skip; + +failed: + release_meta_index(inode, meta); + return err; +} + + +/* + * Get the on-disk location and compressed size of the datablock + * specified by index. Fill_meta_index() does most of the work. + */ +static int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + long long blks; + int offset; + __le32 size; + int res = fill_meta_index(inode, index, &start, &offset, block); + + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" + " 0x%x, block 0x%llx\n", res, index, start, offset, + *block); + + if (res < 0) + return res; + + /* + * res contains the index of the mapping returned by fill_meta_index(), + * this will likely be less than the desired index (because the + * meta_index cache works at a higher granularity). Read any + * extra block indexes needed. + */ + if (res < index) { + blks = read_indexes(inode->i_sb, index - res, &start, &offset); + if (blks < 0) + return (int) blks; + *block += blks; + } + + /* + * Read length of block specified by index. + */ + res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + sizeof(size)); + if (res < 0) + return res; + return le32_to_cpu(size); +} + + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes, i, offset = 0, sparse = 0; + struct squashfs_cache_entry *buffer = NULL; + void *pageaddr; + + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int start_index = page->index & ~mask; + int end_index = start_index | mask; + int file_end = i_size_read(inode) >> msblk->block_log; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + /* + * Reading a datablock from disk. Need to read block list + * to get location and block size. + */ + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) { /* hole */ + bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + sparse = 1; + } else { + /* + * Read and decompress datablock. + */ + buffer = squashfs_get_datablock(inode->i_sb, + block, bsize); + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x" + "\n", block, bsize); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = buffer->length; + } + } else { + /* + * Datablock is stored inside a fragment (tail-end packed + * block). + */ + buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = i_size_read(inode) & (msblk->block_size - 1); + offset = squashfs_i(inode)->fragment_offset; + } + + /* + * Loop copying datablock into pages. As the datablock likely covers + * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * grab the pages from the page cache, except for the page that we've + * been called to fill. + */ + for (i = start_index; i <= end_index && bytes > 0; i++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + struct page *push_page; + int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + + TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + + push_page = (i == page->index) ? page : + grab_cache_page_nowait(page->mapping, i); + + if (!push_page) + continue; + + if (PageUptodate(push_page)) + goto skip_page; + + pageaddr = kmap_atomic(push_page, KM_USER0); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(push_page); + SetPageUptodate(push_page); +skip_page: + unlock_page(push_page); + if (i != page->index) + page_cache_release(push_page); + } + + if (!sparse) + squashfs_cache_put(buffer); + + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page, KM_USER0); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + + +const struct address_space_operations squashfs_aops = { + .readpage = squashfs_readpage +}; diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c new file mode 100644 index 000000000000..b5a2c15bbbc7 --- /dev/null +++ b/fs/squashfs/fragment.c @@ -0,0 +1,98 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * fragment.c + */ + +/* + * This file implements code to handle compressed fragments (tail-end packed + * datablocks). + * + * Regular files contain a fragment index which is mapped to a fragment + * location on disk and compressed size using a fragment lookup table. + * Like everything in Squashfs this fragment lookup table is itself stored + * compressed into metadata blocks. A second index table is used to locate + * these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up fragment using the fragment index table. Return the on disk + * location of the fragment and its compressed size + */ +int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, + u64 *fragment_block) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_FRAGMENT_INDEX(fragment); + int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + struct squashfs_fragment_entry fragment_entry; + int size; + + size = squashfs_read_metadata(sb, &fragment_entry, &start_block, + &offset, sizeof(fragment_entry)); + if (size < 0) + return size; + + *fragment_block = le64_to_cpu(fragment_entry.start_block); + size = le32_to_cpu(fragment_entry.size); + + return size; +} + + +/* + * Read the uncompressed fragment lookup table indexes off disk into memory + */ +__le64 *squashfs_read_fragment_index_table(struct super_block *sb, + u64 fragment_table_start, unsigned int fragments) +{ + unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); + __le64 *fragment_index; + int err; + + /* Allocate fragment lookup table indexes */ + fragment_index = kmalloc(length, GFP_KERNEL); + if (fragment_index == NULL) { + ERROR("Failed to allocate fragment index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, fragment_index, fragment_table_start, + length); + if (err < 0) { + ERROR("unable to read fragment index table\n"); + kfree(fragment_index); + return ERR_PTR(err); + } + + return fragment_index; +} diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c new file mode 100644 index 000000000000..3795b837ba28 --- /dev/null +++ b/fs/squashfs/id.c @@ -0,0 +1,94 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * id.c + */ + +/* + * This file implements code to handle uids and gids. + * + * For space efficiency regular files store uid and gid indexes, which are + * converted to 32-bit uids/gids using an id look up table. This table is + * stored compressed into metadata blocks. A second index table is used to + * locate these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Map uid/gid index into real 32-bit uid/gid using the id look up table + */ +int squashfs_get_id(struct super_block *sb, unsigned int index, + unsigned int *id) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_ID_BLOCK(index); + int offset = SQUASHFS_ID_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->id_table[block]); + __le32 disk_id; + int err; + + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, + sizeof(disk_id)); + if (err < 0) + return err; + + *id = le32_to_cpu(disk_id); + return 0; +} + + +/* + * Read uncompressed id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_id_index_table(struct super_block *sb, + u64 id_table_start, unsigned short no_ids) +{ + unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + __le64 *id_table; + int err; + + TRACE("In read_id_index_table, length %d\n", length); + + /* Allocate id lookup table indexes */ + id_table = kmalloc(length, GFP_KERNEL); + if (id_table == NULL) { + ERROR("Failed to allocate id index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, id_table, id_table_start, length); + if (err < 0) { + ERROR("unable to read id index table\n"); + kfree(id_table); + return ERR_PTR(err); + } + + return id_table; +} diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c new file mode 100644 index 000000000000..7a63398bb855 --- /dev/null +++ b/fs/squashfs/inode.c @@ -0,0 +1,346 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * inode.c + */ + +/* + * This file implements code to create and read inodes from disk. + * + * Inodes in Squashfs are identified by a 48-bit inode which encodes the + * location of the compressed metadata block containing the inode, and the byte + * offset into that block where the inode is placed (<block, offset>). + * + * To maximise compression there are different inodes for each file type + * (regular file, directory, device, etc.), the inode contents and length + * varying with the type. + * + * To further maximise compression, two types of regular file inode and + * directory inode are defined: inodes optimised for frequently occurring + * regular files and directories, and extended types where extra + * information has to be stored. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Initialise VFS inode with the base inode information common to all + * Squashfs inode types. Sqsh_ino contains the unswapped base inode + * off disk. + */ +static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + struct squashfs_base_inode *sqsh_ino) +{ + int err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); + if (err) + return err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); + if (err) + return err; + + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); + inode->i_atime.tv_sec = inode->i_mtime.tv_sec; + inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + + return err; +} + + +struct inode *squashfs_iget(struct super_block *sb, long long ino, + unsigned int ino_number) +{ + struct inode *inode = iget_locked(sb, ino_number); + int err; + + TRACE("Entered squashfs_iget\n"); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = squashfs_read_inode(inode, ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + + +/* + * Initialise VFS inode by reading inode from inode table (compressed + * metadata). The format and amount of data read depends on type. + */ +int squashfs_read_inode(struct inode *inode, long long ino) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + int err, type, offset = SQUASHFS_INODE_OFFSET(ino); + union squashfs_inode squashfs_ino; + struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + + TRACE("Entered squashfs_read_inode\n"); + + /* + * Read inode base common to all inode types. + */ + err = squashfs_read_metadata(sb, sqshb_ino, &block, + &offset, sizeof(*sqshb_ino)); + if (err < 0) + goto failed_read; + + err = squashfs_new_inode(sb, inode, sqshb_ino); + if (err) + goto failed_read; + + block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + offset = SQUASHFS_INODE_OFFSET(ino); + + type = le16_to_cpu(sqshb_ino->inode_type); + switch (type) { + case SQUASHFS_REG_TYPE: { + unsigned int frag_offset, frag_size, frag; + u64 frag_blk; + struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = 1; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_LREG_TYPE: { + unsigned int frag_offset, frag_size, frag; + u64 frag_blk; + struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - + le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_DIR_TYPE: { + struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le16_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_cnt = 0; + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", + SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_LDIR_TYPE: { + struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_start = block; + squashfs_i(inode)->dir_idx_offset = offset; + squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Long directory inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_SYMLINK_TYPE: + case SQUASHFS_LSYMLINK_TYPE: { + struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &squashfs_symlink_aops; + inode->i_mode |= S_IFLNK; + squashfs_i(inode)->start = block; + squashfs_i(inode)->offset = offset; + + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + block, offset); + break; + } + case SQUASHFS_BLKDEV_TYPE: + case SQUASHFS_CHRDEV_TYPE: + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_CHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_FIFO_TYPE: + case SQUASHFS_SOCKET_TYPE: + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_FIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + default: + ERROR("Unknown inode type %d in squashfs_iget!\n", type); + return -EINVAL; + } + + return 0; + +failed_read: + ERROR("Unable to read inode 0x%llx\n", ino); + return err; +} diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c new file mode 100644 index 000000000000..9e398653b22b --- /dev/null +++ b/fs/squashfs/namei.c @@ -0,0 +1,242 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * namei.c + */ + +/* + * This file implements code to do filename lookup in directories. + * + * Like inodes, directories are packed into compressed metadata blocks, stored + * in a directory table. Directories are accessed using the start address of + * the metablock containing the directory and the offset into the + * decompressed block (<block, offset>). + * + * Directories are organised in a slightly complex way, and are not simply + * a list of file names. The organisation takes advantage of the + * fact that (in most cases) the inodes of the files will be in the same + * compressed metadata block, and therefore, can share the start block. + * Directories are therefore organised in a two level list, a directory + * header containing the shared start block value, and a sequence of directory + * entries, each of which share the shared start block. A new directory header + * is written once/if the inode start block changes. The directory + * header/directory entry list is repeated as many times as necessary. + * + * Directories are sorted, and can contain a directory index to speed up + * file lookup. Directory indexes store one entry per metablock, each entry + * storing the index/filename mapping to the first directory header + * in each metadata block. Directories are sorted in alphabetical order, + * and at lookup the index is scanned linearly looking for the first filename + * alphabetically larger than the filename being looked up. At this point the + * location of the metadata block the filename is in has been found. + * The general idea of the index is ensure only one metadata block needs to be + * decompressed to do a lookup irrespective of the length of the directory. + * This scheme has the advantage that it doesn't require extra memory overhead + * and doesn't require much extra storage on disk. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/dcache.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Lookup name in the directory index, returning the location of the metadata + * block containing it, and the directory index this represents. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_name(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, + int index_offset, int i_count, const char *name, + int len) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int i, size, length = 0, err; + struct squashfs_dir_index *index; + char *str; + + TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); + + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + if (index == NULL) { + ERROR("Failed to allocate squashfs_dir_index\n"); + goto out; + } + + str = &index->name[SQUASHFS_NAME_LEN + 1]; + strncpy(str, name, len); + str[len] = '\0'; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, index, &index_start, + &index_offset, sizeof(*index)); + if (err < 0) + break; + + + size = le32_to_cpu(index->size) + 1; + + err = squashfs_read_metadata(sb, index->name, &index_start, + &index_offset, size); + if (err < 0) + break; + + index->name[size] = '\0'; + + if (strcmp(index->name, str) > 0) + break; + + length = le32_to_cpu(index->index); + *next_block = le32_to_cpu(index->start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + kfree(index); + +out: + /* + * Return index (f_pos) of the looked up metadata block. Translate + * from internal f_pos to external f_pos which is offset by 3 because + * we invent "." and ".." entries which are not actually stored in the + * directory. + */ + return length + 3; +} + + +static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *inode = NULL; + struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + u64 block = squashfs_i(dir)->start + msblk->directory_table; + int offset = squashfs_i(dir)->offset; + int err, length = 0, dir_count, size; + + TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + return ERR_PTR(-ENOMEM); + } + + if (len > SQUASHFS_NAME_LEN) { + err = -ENAMETOOLONG; + goto failed; + } + + length = get_dir_index_using_name(dir->i_sb, &block, &offset, + squashfs_i(dir)->dir_idx_start, + squashfs_i(dir)->dir_idx_offset, + squashfs_i(dir)->dir_idx_cnt, name, len); + + while (length < i_size_read(dir)) { + /* + * Read directory header. + */ + err = squashfs_read_metadata(dir->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto read_failure; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(dir->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto read_failure; + + size = le16_to_cpu(dire->size) + 1; + + err = squashfs_read_metadata(dir->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto read_failure; + + length += sizeof(*dire) + size; + + if (name[0] < dire->name[0]) + goto exit_lookup; + + if (len == size && !strncmp(name, dire->name, len)) { + unsigned int blk, off, ino_num; + long long ino; + blk = le32_to_cpu(dirh.start_block); + off = le16_to_cpu(dire->offset); + ino_num = le32_to_cpu(dirh.inode_number) + + (short) le16_to_cpu(dire->inode_number); + ino = SQUASHFS_MKINODE(blk, off); + + TRACE("calling squashfs_iget for directory " + "entry %s, inode %x:%x, %d\n", name, + blk, off, ino_num); + + inode = squashfs_iget(dir->i_sb, ino, ino_num); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto failed; + } + + goto exit_lookup; + } + } + } + +exit_lookup: + kfree(dire); + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return ERR_PTR(0); + +read_failure: + ERROR("Unable to read directory block [%llx:%x]\n", + squashfs_i(dir)->start + msblk->directory_table, + squashfs_i(dir)->offset); +failed: + kfree(dire); + return ERR_PTR(err); +} + + +const struct inode_operations squashfs_dir_inode_ops = { + .lookup = squashfs_lookup +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h new file mode 100644 index 000000000000..6b2515d027d5 --- /dev/null +++ b/fs/squashfs/squashfs.h @@ -0,0 +1,90 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs.h + */ + +#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) + +#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) + +#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} + +/* block.c */ +extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, + int); + +/* cache.c */ +extern struct squashfs_cache *squashfs_cache_init(char *, int, int); +extern void squashfs_cache_delete(struct squashfs_cache *); +extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, + struct squashfs_cache *, u64, int); +extern void squashfs_cache_put(struct squashfs_cache_entry *); +extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); +extern int squashfs_read_metadata(struct super_block *, void *, u64 *, + int *, int); +extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, + u64, int); +extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, + u64, int); +extern int squashfs_read_table(struct super_block *, void *, u64, int); + +/* export.c */ +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, + unsigned int); + +/* fragment.c */ +extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); +extern __le64 *squashfs_read_fragment_index_table(struct super_block *, + u64, unsigned int); + +/* id.c */ +extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, + unsigned short); + +/* inode.c */ +extern struct inode *squashfs_iget(struct super_block *, long long, + unsigned int); +extern int squashfs_read_inode(struct inode *, long long); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations squashfs_dir_ops; + +/* export.c */ +extern const struct export_operations squashfs_export_ops; + +/* file.c */ +extern const struct address_space_operations squashfs_aops; + +/* namei.c */ +extern const struct inode_operations squashfs_dir_inode_ops; + +/* symlink.c */ +extern const struct address_space_operations squashfs_symlink_aops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h new file mode 100644 index 000000000000..6840da1bf21e --- /dev/null +++ b/fs/squashfs/squashfs_fs.h @@ -0,0 +1,381 @@ +#ifndef SQUASHFS_FS +#define SQUASHFS_FS +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs.h + */ + +#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE +#define SQUASHFS_MAJOR 4 +#define SQUASHFS_MINOR 0 +#define SQUASHFS_MAGIC 0x73717368 +#define SQUASHFS_START 0 + +/* size of metadata (inode and directory) blocks */ +#define SQUASHFS_METADATA_SIZE 8192 +#define SQUASHFS_METADATA_LOG 13 + +/* default size of data blocks */ +#define SQUASHFS_FILE_SIZE 131072 +#define SQUASHFS_FILE_LOG 17 + +#define SQUASHFS_FILE_MAX_SIZE 1048576 +#define SQUASHFS_FILE_MAX_LOG 20 + +/* Max number of uids and gids */ +#define SQUASHFS_IDS 65536 + +/* Max length of filename (not 255) */ +#define SQUASHFS_NAME_LEN 256 + +#define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_BLK (-1LL) + +/* Filesystem flags */ +#define SQUASHFS_NOI 0 +#define SQUASHFS_NOD 1 +#define SQUASHFS_NOF 3 +#define SQUASHFS_NO_FRAG 4 +#define SQUASHFS_ALWAYS_FRAG 5 +#define SQUASHFS_DUPLICATE 6 +#define SQUASHFS_EXPORT 7 + +#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) + +#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOI) + +#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOD) + +#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOF) + +#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NO_FRAG) + +#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_ALWAYS_FRAG) + +#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_DUPLICATE) + +#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_EXPORT) + +/* Max number of types and file types */ +#define SQUASHFS_DIR_TYPE 1 +#define SQUASHFS_REG_TYPE 2 +#define SQUASHFS_SYMLINK_TYPE 3 +#define SQUASHFS_BLKDEV_TYPE 4 +#define SQUASHFS_CHRDEV_TYPE 5 +#define SQUASHFS_FIFO_TYPE 6 +#define SQUASHFS_SOCKET_TYPE 7 +#define SQUASHFS_LDIR_TYPE 8 +#define SQUASHFS_LREG_TYPE 9 +#define SQUASHFS_LSYMLINK_TYPE 10 +#define SQUASHFS_LBLKDEV_TYPE 11 +#define SQUASHFS_LCHRDEV_TYPE 12 +#define SQUASHFS_LFIFO_TYPE 13 +#define SQUASHFS_LSOCKET_TYPE 14 + +/* Flag whether block is compressed or uncompressed, bit is set if block is + * uncompressed */ +#define SQUASHFS_COMPRESSED_BIT (1 << 15) + +#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \ + (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) + +#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) + +#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) + +#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ + ~SQUASHFS_COMPRESSED_BIT_BLOCK) + +#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) + +/* + * Inode number ops. Inodes consist of a compressed block number, and an + * uncompressed offset within that block + */ +#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ + << 16) + (B))) + +/* Translate between VFS mode and squashfs mode */ +#define SQUASHFS_MODE(A) ((A) & 0xfff) + +/* fragment and fragment table defines */ +#define SQUASHFS_FRAGMENT_BYTES(A) \ + ((A) * sizeof(struct squashfs_fragment_entry)) + +#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ + sizeof(u64)) + +/* inode lookup table defines */ +#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) + +#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ + sizeof(u64)) + +/* uid/gid lookup table defines */ +#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) + +#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ + sizeof(u64)) + +/* cached data constants for filesystem */ +#define SQUASHFS_CACHED_BLKS 8 + +#define SQUASHFS_MAX_FILE_SIZE_LOG 64 + +#define SQUASHFS_MAX_FILE_SIZE (1LL << \ + (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) + +#define SQUASHFS_MARKER_BYTE 0xff + +/* meta index cache */ +#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) +#define SQUASHFS_META_ENTRIES 127 +#define SQUASHFS_META_SLOTS 8 + +struct meta_entry { + u64 data_block; + unsigned int index_block; + unsigned short offset; + unsigned short pad; +}; + +struct meta_index { + unsigned int inode_number; + unsigned int offset; + unsigned short entries; + unsigned short skip; + unsigned short locked; + unsigned short pad; + struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; +}; + + +/* + * definitions for structures on disk + */ +#define ZLIB_COMPRESSION 1 + +struct squashfs_super_block { + __le32 s_magic; + __le32 inodes; + __le32 mkfs_time; + __le32 block_size; + __le32 fragments; + __le16 compression; + __le16 block_log; + __le16 flags; + __le16 no_ids; + __le16 s_major; + __le16 s_minor; + __le64 root_inode; + __le64 bytes_used; + __le64 id_table_start; + __le64 xattr_table_start; + __le64 inode_table_start; + __le64 directory_table_start; + __le64 fragment_table_start; + __le64 lookup_table_start; +}; + +struct squashfs_dir_index { + __le32 index; + __le32 start_block; + __le32 size; + unsigned char name[0]; +}; + +struct squashfs_base_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; +}; + +struct squashfs_ipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; +}; + +struct squashfs_dev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; +}; + +struct squashfs_symlink_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 symlink_size; + char symlink[0]; +}; + +struct squashfs_reg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 fragment; + __le32 offset; + __le32 file_size; + __le16 block_list[0]; +}; + +struct squashfs_lreg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le64 start_block; + __le64 file_size; + __le64 sparse; + __le32 nlink; + __le32 fragment; + __le32 offset; + __le32 xattr; + __le16 block_list[0]; +}; + +struct squashfs_dir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 nlink; + __le16 file_size; + __le16 offset; + __le32 parent_inode; +}; + +struct squashfs_ldir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 file_size; + __le32 start_block; + __le32 parent_inode; + __le16 i_count; + __le16 offset; + __le32 xattr; + struct squashfs_dir_index index[0]; +}; + +union squashfs_inode { + struct squashfs_base_inode base; + struct squashfs_dev_inode dev; + struct squashfs_symlink_inode symlink; + struct squashfs_reg_inode reg; + struct squashfs_lreg_inode lreg; + struct squashfs_dir_inode dir; + struct squashfs_ldir_inode ldir; + struct squashfs_ipc_inode ipc; +}; + +struct squashfs_dir_entry { + __le16 offset; + __le16 inode_number; + __le16 type; + __le16 size; + char name[0]; +}; + +struct squashfs_dir_header { + __le32 count; + __le32 start_block; + __le32 inode_number; +}; + +struct squashfs_fragment_entry { + __le64 start_block; + __le32 size; + unsigned int unused; +}; + +#endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 000000000000..fbfca30c0c68 --- /dev/null +++ b/fs/squashfs/squashfs_fs_i.h @@ -0,0 +1,45 @@ +#ifndef SQUASHFS_FS_I +#define SQUASHFS_FS_I +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_i.h + */ + +struct squashfs_inode_info { + u64 start; + int offset; + union { + struct { + u64 fragment_block; + int fragment_size; + int fragment_offset; + u64 block_list_start; + }; + struct { + u64 dir_idx_start; + int dir_idx_offset; + int dir_idx_cnt; + int parent; + }; + }; + struct inode vfs_inode; +}; +#endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 000000000000..c8c65614dd1c --- /dev/null +++ b/fs/squashfs/squashfs_fs_sb.h @@ -0,0 +1,76 @@ +#ifndef SQUASHFS_FS_SB +#define SQUASHFS_FS_SB +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_sb.h + */ + +#include "squashfs_fs.h" + +struct squashfs_cache { + char *name; + int entries; + int next_blk; + int num_waiters; + int unused; + int block_size; + int pages; + spinlock_t lock; + wait_queue_head_t wait_queue; + struct squashfs_cache_entry *entry; +}; + +struct squashfs_cache_entry { + u64 block; + int length; + int refcount; + u64 next_index; + int pending; + int error; + int num_waiters; + wait_queue_head_t wait_queue; + struct squashfs_cache *cache; + void **data; +}; + +struct squashfs_sb_info { + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + unsigned int *fragment_index_2; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + z_stream stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; +}; +#endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c new file mode 100644 index 000000000000..a0466d7467b2 --- /dev/null +++ b/fs/squashfs/super.c @@ -0,0 +1,440 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * super.c + */ + +/* + * This file implements code to read the superblock, read and initialise + * in-memory structures at mount time, and all the VFS glue code to register + * the filesystem. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static struct file_system_type squashfs_fs_type; +static struct super_operations squashfs_super_ops; + +static int supported_squashfs_filesystem(short major, short minor, short comp) +{ + if (major < SQUASHFS_MAJOR) { + ERROR("Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported\n", major, minor); + return -EINVAL; + } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { + ERROR("Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem\n", major, minor); + ERROR("Please update your kernel\n"); + return -EINVAL; + } + + if (comp != ZLIB_COMPRESSION) + return -EINVAL; + + return 0; +} + + +static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct squashfs_sb_info *msblk; + struct squashfs_super_block *sblk = NULL; + char b[BDEVNAME_SIZE]; + struct inode *root; + long long root_inode; + unsigned short flags; + unsigned int fragments; + u64 lookup_table_start; + int err; + + TRACE("Entered squashfs_fill_superblock\n"); + + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + ERROR("Failed to allocate squashfs_sb_info\n"); + return -ENOMEM; + } + msblk = sb->s_fs_info; + + msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + if (msblk->stream.workspace == NULL) { + ERROR("Failed to allocate zlib workspace\n"); + goto failure; + } + + sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); + if (sblk == NULL) { + ERROR("Failed to allocate squashfs_super_block\n"); + goto failure; + } + + msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize_log2 = ffz(~msblk->devblksize); + + mutex_init(&msblk->read_data_mutex); + mutex_init(&msblk->meta_index_mutex); + + /* + * msblk->bytes_used is checked in squashfs_read_table to ensure reads + * are not beyond filesystem end. But as we're using + * squashfs_read_table here to read the superblock (including the value + * of bytes_used) we need to set it to an initial sensible dummy value + */ + msblk->bytes_used = sizeof(*sblk); + err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk)); + + if (err < 0) { + ERROR("unable to read squashfs_super_block\n"); + goto failed_mount; + } + + /* Check it is a SQUASHFS superblock */ + sb->s_magic = le32_to_cpu(sblk->s_magic); + if (sb->s_magic != SQUASHFS_MAGIC) { + if (!silent) + ERROR("Can't find a SQUASHFS superblock on %s\n", + bdevname(sb->s_bdev, b)); + err = -EINVAL; + goto failed_mount; + } + + /* Check the MAJOR & MINOR versions and compression type */ + err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), + le16_to_cpu(sblk->s_minor), + le16_to_cpu(sblk->compression)); + if (err < 0) + goto failed_mount; + + err = -EINVAL; + + /* + * Check if there's xattrs in the filesystem. These are not + * supported in this version, so warn that they will be ignored. + */ + if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK) + ERROR("Xattrs in filesystem, these will be ignored\n"); + + /* Check the filesystem does not extend beyond the end of the + block device */ + msblk->bytes_used = le64_to_cpu(sblk->bytes_used); + if (msblk->bytes_used < 0 || msblk->bytes_used > + i_size_read(sb->s_bdev->bd_inode)) + goto failed_mount; + + /* Check block size for sanity */ + msblk->block_size = le32_to_cpu(sblk->block_size); + if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) + goto failed_mount; + + msblk->block_log = le16_to_cpu(sblk->block_log); + if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) + goto failed_mount; + + /* Check the root inode for sanity */ + root_inode = le64_to_cpu(sblk->root_inode); + if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) + goto failed_mount; + + msblk->inode_table = le64_to_cpu(sblk->inode_table_start); + msblk->directory_table = le64_to_cpu(sblk->directory_table_start); + msblk->inodes = le32_to_cpu(sblk->inodes); + flags = le16_to_cpu(sblk->flags); + + TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) + ? "un" : ""); + TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) + ? "un" : ""); + TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); + TRACE("Block size %d\n", msblk->block_size); + TRACE("Number of inodes %d\n", msblk->inodes); + TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); + TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); + TRACE("sblk->fragment_table_start %llx\n", + (u64) le64_to_cpu(sblk->fragment_table_start)); + TRACE("sblk->id_table_start %llx\n", + (u64) le64_to_cpu(sblk->id_table_start)); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_flags |= MS_RDONLY; + sb->s_op = &squashfs_super_ops; + + err = -ENOMEM; + + msblk->block_cache = squashfs_cache_init("metadata", + SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); + if (msblk->block_cache == NULL) + goto failed_mount; + + /* Allocate read_page block */ + msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + if (msblk->read_page == NULL) { + ERROR("Failed to allocate read_page block\n"); + goto failed_mount; + } + + /* Allocate and read id index table */ + msblk->id_table = squashfs_read_id_index_table(sb, + le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); + if (IS_ERR(msblk->id_table)) { + err = PTR_ERR(msblk->id_table); + msblk->id_table = NULL; + goto failed_mount; + } + + fragments = le32_to_cpu(sblk->fragments); + if (fragments == 0) + goto allocate_lookup_table; + + msblk->fragment_cache = squashfs_cache_init("fragment", + SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + if (msblk->fragment_cache == NULL) { + err = -ENOMEM; + goto failed_mount; + } + + /* Allocate and read fragment index table */ + msblk->fragment_index = squashfs_read_fragment_index_table(sb, + le64_to_cpu(sblk->fragment_table_start), fragments); + if (IS_ERR(msblk->fragment_index)) { + err = PTR_ERR(msblk->fragment_index); + msblk->fragment_index = NULL; + goto failed_mount; + } + +allocate_lookup_table: + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto allocate_root; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + + sb->s_export_op = &squashfs_export_ops; + +allocate_root: + root = new_inode(sb); + if (!root) { + err = -ENOMEM; + goto failed_mount; + } + + err = squashfs_read_inode(root, root_inode); + if (err) { + iget_failed(root); + goto failed_mount; + } + insert_inode_hash(root); + + sb->s_root = d_alloc_root(root); + if (sb->s_root == NULL) { + ERROR("Root inode create failed\n"); + err = -ENOMEM; + iput(root); + goto failed_mount; + } + + TRACE("Leaving squashfs_fill_super\n"); + kfree(sblk); + return 0; + +failed_mount: + squashfs_cache_delete(msblk->block_cache); + squashfs_cache_delete(msblk->fragment_cache); + squashfs_cache_delete(msblk->read_page); + kfree(msblk->inode_lookup_table); + kfree(msblk->fragment_index); + kfree(msblk->id_table); + kfree(msblk->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kfree(sblk); + return err; + +failure: + kfree(msblk->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + return -ENOMEM; +} + + +static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + + TRACE("Entered squashfs_statfs\n"); + + buf->f_type = SQUASHFS_MAGIC; + buf->f_bsize = msblk->block_size; + buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; + buf->f_bfree = buf->f_bavail = 0; + buf->f_files = msblk->inodes; + buf->f_ffree = 0; + buf->f_namelen = SQUASHFS_NAME_LEN; + + return 0; +} + + +static int squashfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + + +static void squashfs_put_super(struct super_block *sb) +{ + if (sb->s_fs_info) { + struct squashfs_sb_info *sbi = sb->s_fs_info; + squashfs_cache_delete(sbi->block_cache); + squashfs_cache_delete(sbi->fragment_cache); + squashfs_cache_delete(sbi->read_page); + kfree(sbi->id_table); + kfree(sbi->fragment_index); + kfree(sbi->meta_index); + kfree(sbi->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + } +} + + +static int squashfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, + mnt); +} + + +static struct kmem_cache *squashfs_inode_cachep; + + +static void init_once(void *foo) +{ + struct squashfs_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + + +static int __init init_inodecache(void) +{ + squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", + sizeof(struct squashfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + + return squashfs_inode_cachep ? 0 : -ENOMEM; +} + + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(squashfs_inode_cachep); +} + + +static int __init init_squashfs_fs(void) +{ + int err = init_inodecache(); + + if (err) + return err; + + err = register_filesystem(&squashfs_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + "Phillip Lougher\n"); + + return 0; +} + + +static void __exit exit_squashfs_fs(void) +{ + unregister_filesystem(&squashfs_fs_type); + destroy_inodecache(); +} + + +static struct inode *squashfs_alloc_inode(struct super_block *sb) +{ + struct squashfs_inode_info *ei = + kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + + return ei ? &ei->vfs_inode : NULL; +} + + +static void squashfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); +} + + +static struct file_system_type squashfs_fs_type = { + .owner = THIS_MODULE, + .name = "squashfs", + .get_sb = squashfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV +}; + +static struct super_operations squashfs_super_ops = { + .alloc_inode = squashfs_alloc_inode, + .destroy_inode = squashfs_destroy_inode, + .statfs = squashfs_statfs, + .put_super = squashfs_put_super, + .remount_fs = squashfs_remount +}; + +module_init(init_squashfs_fs); +module_exit(exit_squashfs_fs); +MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); +MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>"); +MODULE_LICENSE("GPL"); diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c new file mode 100644 index 000000000000..83d87880aac8 --- /dev/null +++ b/fs/squashfs/symlink.c @@ -0,0 +1,118 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * symlink.c + */ + +/* + * This file implements code to handle symbolic links. + * + * The data contents of symbolic links are stored inside the symbolic + * link inode within the inode table. This allows the normally small symbolic + * link to be compressed as part of the inode table, achieving much greater + * compression than if the symbolic link was compressed individually. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static int squashfs_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + int index = page->index << PAGE_CACHE_SHIFT; + u64 block = squashfs_i(inode)->start; + int offset = squashfs_i(inode)->offset; + int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int bytes, copied; + void *pageaddr; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " + "%llx, offset %x\n", page->index, block, offset); + + /* + * Skip index bytes into symlink metadata. + */ + if (index) { + bytes = squashfs_read_metadata(sb, NULL, &block, &offset, + index); + if (bytes < 0) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + goto error_out; + } + } + + /* + * Read length bytes from symlink metadata. Squashfs_read_metadata + * is not used here because it can sleep and we want to use + * kmap_atomic to map the page. Instead call the underlying + * squashfs_cache_get routine. As length bytes may overlap metadata + * blocks, we may need to call squashfs_cache_get multiple times. + */ + for (bytes = 0; bytes < length; offset = 0, bytes += copied) { + entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); + if (entry->error) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + squashfs_cache_put(entry); + goto error_out; + } + + pageaddr = kmap_atomic(page, KM_USER0); + copied = squashfs_copy_data(pageaddr + bytes, entry, offset, + length - bytes); + if (copied == length - bytes) + memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + else + block = entry->next_index; + kunmap_atomic(pageaddr, KM_USER0); + squashfs_cache_put(entry); + } + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +error_out: + SetPageError(page); + unlock_page(page); + return 0; +} + + +const struct address_space_operations squashfs_symlink_aops = { + .readpage = squashfs_symlink_readpage +}; diff --git a/fs/stat.c b/fs/stat.c index 7c46fbeb8b76..7e12a6f82795 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, struct inode *inode = path.dentry->d_inode; error = -EINVAL; - if (inode->i_op && inode->i_op->readlink) { + if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(path.mnt, path.dentry); diff --git a/fs/super.c b/fs/super.c index ddba069d7a99..ed080c417167 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,7 @@ #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/async.h> #include <asm/uaccess.h> #include "internal.h" @@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; + if (sb->s_root) { shrink_dcache_for_umount(sb); fsync_super(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; + + /* + * wait for asynchronous fs operations to finish before going further + */ + async_synchronize_full_special(&sb->s_async_list); + /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); lock_kernel(); @@ -461,6 +470,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + async_synchronize_full_special(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); @@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; } return simple_set_mnt(mnt, s); @@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; + bdev->bd_super = 0; generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_exclusive(bdev, mode); diff --git a/fs/sync.c b/fs/sync.c index 2967562d416f..ac02b56548bc 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) return ret; } -long do_fsync(struct file *file, int datasync) +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @data: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - int ret; - int err; - struct address_space *mapping = file->f_mapping; + const struct file_operations *fop; + struct address_space *mapping; + int err, ret; + + /* + * Get mapping and operations from the file in case we have + * as file, or get the default values for them in case we + * don't have a struct file available. Damn nfsd.. + */ + if (file) { + mapping = file->f_mapping; + fop = file->f_op; + } else { + mapping = dentry->d_inode->i_mapping; + fop = dentry->d_inode->i_fop; + } - if (!file->f_op || !file->f_op->fsync) { - /* Why? We can still call filemap_fdatawrite */ + if (!fop || !fop->fsync) { ret = -EINVAL; goto out; } @@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync) * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_path.dentry, datasync); + err = fop->fsync(file, dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync) out: return ret; } +EXPORT_SYMBOL(vfs_fsync); -static long __do_fsync(unsigned int fd, int datasync) +static int do_fsync(unsigned int fd, int datasync) { struct file *file; int ret = -EBADF; file = fget(fd); if (file) { - ret = do_fsync(file, datasync); + ret = vfs_fsync(file, file->f_path.dentry, datasync); fput(file); } return ret; @@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync) asmlinkage long sys_fsync(unsigned int fd) { - return __do_fsync(fd, 0); + return do_fsync(fd, 0); } asmlinkage long sys_fdatasync(unsigned int fd) { - return __do_fsync(fd, 1); + return do_fsync(fd, 1); } /* @@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, if (flags & SYNC_FILE_RANGE_WRITE) { ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + WB_SYNC_ALL); if (ret < 0) goto out; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index eb53c632f856..dfa3d94cfc74 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) static inline void set_default_inode_attr(struct inode * inode, mode_t mode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } @@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; - inode->i_blocks = 0; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index df0d435baa48..3d81bf58dae2 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -27,6 +27,7 @@ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/namei.h> #include <asm/byteorder.h> #include "sysv.h" @@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) if (inode->i_blocks) { inode->i_op = &sysv_symlink_inode_operations; inode->i_mapping->a_ops = &sysv_aops; - } else + } else { inode->i_op = &sysv_fast_symlink_inode_operations; + nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + sizeof(SYSV_I(inode)->i_data) - 1); + } } else init_special_inode(inode, inode->i_mode, rdev); } diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5bf..e35b54d5059d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB depends on UBIFS_FS default y help - Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. # Debugging-related stuff config UBIFS_FS_DEBUG diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 4a18f084cc42..175f9c590b77 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -32,18 +32,15 @@ #include "ubifs.h" #include <linux/writeback.h> -#include <asm/div64.h> +#include <linux/math64.h> /* * When pessimistic budget calculations say that there is no enough space, * UBIFS starts writing back dirty inodes and pages, doing garbage collection, - * or committing. The below constants define maximum number of times UBIFS + * or committing. The below constant defines maximum number of times UBIFS * repeats the operations. */ -#define MAX_SHRINK_RETRIES 8 -#define MAX_GC_RETRIES 4 -#define MAX_CMT_RETRIES 2 -#define MAX_NOSPC_RETRIES 1 +#define MAX_MKSPC_RETRIES 3 /* * The below constant defines amount of dirty pages which should be written @@ -52,30 +49,6 @@ #define NR_TO_WRITE 16 /** - * struct retries_info - information about re-tries while making free space. - * @prev_liability: previous liability - * @shrink_cnt: how many times the liability was shrinked - * @shrink_retries: count of liability shrink re-tries (increased when - * liability does not shrink) - * @try_gc: GC should be tried first - * @gc_retries: how many times GC was run - * @cmt_retries: how many times commit has been done - * @nospc_retries: how many times GC returned %-ENOSPC - * - * Since we consider budgeting to be the fast-path, and this structure has to - * be allocated on stack and zeroed out, we make it smaller using bit-fields. - */ -struct retries_info { - long long prev_liability; - unsigned int shrink_cnt; - unsigned int shrink_retries:5; - unsigned int try_gc:1; - unsigned int gc_retries:4; - unsigned int cmt_retries:3; - unsigned int nospc_retries:1; -}; - -/** * shrink_liability - write-back some dirty pages/inodes. * @c: UBIFS file-system description object * @nr_to_write: how many dirty pages to write-back @@ -147,13 +120,29 @@ static int run_gc(struct ubifs_info *c) } /** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** * make_free_space - make more free space on the file-system. * @c: UBIFS file-system description object - * @ri: information about previous invocations of this function * * This function is called when an operation cannot be budgeted because there * is supposedly no free space. But in most cases there is some free space: - * o budgeting is pessimistic, so it always budgets more then it is actually + * o budgeting is pessimistic, so it always budgets more than it is actually * needed, so shrinking the liability is one way to make free space - the * cached data will take less space then it was budgeted for; * o GC may turn some dark space into free space (budgeting treats dark space @@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c) * Returns %-ENOSPC if it couldn't do more free space, and other negative error * codes on failures. */ -static int make_free_space(struct ubifs_info *c, struct retries_info *ri) +static int make_free_space(struct ubifs_info *c) { - int err; - - /* - * If we have some dirty pages and inodes (liability), try to write - * them back unless this was tried too many times without effect - * already. - */ - if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { - long long liability; - - spin_lock(&c->space_lock); - liability = c->budg_idx_growth + c->budg_data_growth + - c->budg_dd_growth; - spin_unlock(&c->space_lock); + int err, retries = 0; + long long liab1, liab2; - if (ri->prev_liability >= liability) { - /* Liability does not shrink, next time try GC then */ - ri->shrink_retries += 1; - if (ri->gc_retries < MAX_GC_RETRIES) - ri->try_gc = 1; - dbg_budg("liability did not shrink: retries %d of %d", - ri->shrink_retries, MAX_SHRINK_RETRIES); - } + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); - dbg_budg("force write-back (count %d)", ri->shrink_cnt); - shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; - ri->prev_liability = liability; - ri->shrink_cnt += 1; - return -EAGAIN; - } + dbg_budg("new liability %lld (not shrinked)", liab2); - /* - * Try to run garbage collector unless it was already tried too many - * times. - */ - if (ri->gc_retries < MAX_GC_RETRIES) { - ri->gc_retries += 1; - dbg_budg("run GC, retries %d of %d", - ri->gc_retries, MAX_GC_RETRIES); - - ri->try_gc = 0; + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); err = run_gc(c); if (!err) return -EAGAIN; - if (err == -EAGAIN) { - dbg_budg("GC asked to commit"); - err = ubifs_run_commit(c); - if (err) - return err; - return -EAGAIN; - } - - if (err != -ENOSPC) - return err; - - /* - * GC could not make any progress. If this is the first time, - * then it makes sense to try to commit, because it might make - * some dirty space. - */ - dbg_budg("GC returned -ENOSPC, retries %d", - ri->nospc_retries); - if (ri->nospc_retries >= MAX_NOSPC_RETRIES) + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ return err; - ri->nospc_retries += 1; - } - /* Neither GC nor write-back helped, try to commit */ - if (ri->cmt_retries < MAX_CMT_RETRIES) { - ri->cmt_retries += 1; - dbg_budg("run commit, retries %d of %d", - ri->cmt_retries, MAX_CMT_RETRIES); + dbg_budg("Run commit (retries %d)", retries); err = ubifs_run_commit(c); if (err) return err; - return -EAGAIN; - } + } while (retries++ < MAX_MKSPC_RETRIES); + return -ENOSPC; } @@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri) */ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int ret; - uint64_t idx_size; + int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; + long long idx_size; idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; @@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. */ - if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) - ret = idx_size + 1; - else - ret = idx_size; + idx_size += eff_leb_size - 1; + idx_lebs = div_u64(idx_size, eff_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. */ - ret += 1; - /* - * At present the index needs at least 2 LEBs: one for the index head - * and one for in-the-gaps method (which currently does not cater for - * the index head and so excludes it from consideration). - */ - if (ret < 2) - ret = 2; - return ret; + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; } /** @@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c, int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) { int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); - int err, idx_growth, data_growth, dd_growth; - struct retries_info ri; + int err, idx_growth, data_growth, dd_growth, retried = 0; ubifs_assert(req->new_page <= 1); ubifs_assert(req->dirtied_page <= 1); @@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) if (!data_growth && !dd_growth) return 0; idx_growth = calc_idx_growth(c, req); - memset(&ri, 0, sizeof(struct retries_info)); again: spin_lock(&c->space_lock); @@ -587,12 +522,17 @@ again: return err; } - err = make_free_space(c, &ri); + err = make_free_space(c); + cond_resched(); if (err == -EAGAIN) { dbg_budg("try again"); - cond_resched(); goto again; } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } dbg_budg("FS is full, -ENOSPC"); c->nospace = 1; if (can_use_rp(c) || c->rp_size == 0) @@ -666,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) * @c: UBIFS file-system description object * * This function converts budget which was allocated for a new page of data to - * the budget of changing an existing page of data. The latter is smaller then + * the budget of changing an existing page of data. The latter is smaller than * the former, so this function only does simple re-calculation and does not * involve any write-back. */ @@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, * user-space. User-space application tend to expect that if the file-system * (e.g., via the 'statfs()' call) reports that it has N bytes available, they * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. * * This function assumes free space is made up of uncompressed data nodes and * full index nodes (one per data node, tripled because we always allow enough @@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, * Note, the calculation is pessimistic, which means that most of the time * UBIFS reports less space than it actually has. */ -long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +long long ubifs_reported_space(const struct ubifs_info *c, long long free) { int divisor, factor, f; @@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) * of data nodes, f - fanout. Because effective UBIFS fanout is twice * as less than maximum fanout, we assume that each data node * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. - * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * Note, the multiplier 3 is because UBIFS reserves thrice as more space * for the index. */ f = c->fanout > 3 ? c->fanout >> 1 : 2; @@ -745,8 +685,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) divisor = UBIFS_MAX_DATA_NODE_SZ; divisor += (c->max_idx_node_sz * 3) / (f - 1); free *= factor; - do_div(free, divisor); - return free; + return div_u64(free, divisor); } /** @@ -756,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) * This function calculates amount of free space to report to user-space. * * Because UBIFS may introduce substantial overhead (the index, node headers, - * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * alignment, wastage at the end of eraseblocks, etc), it cannot report real * amount of free flash space it has (well, because not all dirty space is - * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, - * it would bread user expectetion about what free space is. Users seem to + * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectations about what free space is. Users seem to * accustomed to assume that if the file-system reports N bytes of free space, * they would be able to fit a file of N bytes to the FS. This almost works for * traditional file-systems, because they have way less overhead than UBIFS. @@ -771,18 +710,9 @@ long long ubifs_get_free_space(struct ubifs_info *c) long long available, outstanding, free; spin_lock(&c->space_lock); - min_idx_lebs = ubifs_calc_min_idx_lebs(c); + min_idx_lebs = c->min_idx_lebs; + ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c)); outstanding = c->budg_data_growth + c->budg_dd_growth; - - /* - * Force the amount available to the total size reported if the used - * space is zero. - */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { - spin_unlock(&c->space_lock); - return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; - } - available = ubifs_calc_available(c, min_idx_lebs); /* diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index b49884c8c10e..f3a7945527fb 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) { struct ubifs_idx_node *idx; int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; - c->old_zroot = *zroot; - - lnum = c->old_zroot.lnum; - offs = c->old_zroot.offs; - len = c->old_zroot.len; + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); if (!idx) @@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) if (err) goto out; - c->old_zroot_level = le16_to_cpu(idx->level); - c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); out: kfree(idx); return err; @@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) { int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; int first = 1, iip; + struct ubifs_debug_info *d = c->dbg; union ubifs_key lower_key, upper_key, l_key, u_key; unsigned long long uninitialized_var(last_sqnum); struct ubifs_idx_node *idx; @@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) UBIFS_IDX_NODE_SZ; /* Start at the old zroot */ - lnum = c->old_zroot.lnum; - offs = c->old_zroot.offs; - len = c->old_zroot.len; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; iip = 0; /* @@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) if (first) { first = 0; /* Check root level and sqnum */ - if (le16_to_cpu(idx->level) != c->old_zroot_level) { + if (le16_to_cpu(idx->level) != d->old_zroot_level) { err = 2; goto out_dump; } - if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { err = 3; goto out_dump; } diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index a0ada596b17c..11e4132f314a 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -33,7 +33,7 @@ /* Fake description object for the "none" compressor */ static struct ubifs_compressor none_compr = { .compr_type = UBIFS_COMPR_NONE, - .name = "no compression", + .name = "none", .capi_name = "", }; @@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex); static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, .comp_mutex = &lzo_mutex, - .name = "LZO", + .name = "lzo", .capi_name = "lzo", }; #else static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, - .name = "LZO", + .name = "lzo", }; #endif @@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, if (compr->comp_mutex) mutex_lock(compr->comp_mutex); err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, - out_len); + (unsigned int *)out_len); if (compr->comp_mutex) mutex_unlock(compr->comp_mutex); if (unlikely(err)) { @@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, } /* - * Presently, we just require that compression results in less data, - * rather than any defined minimum compression ratio or amount. + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. */ - if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) goto no_compr; return; @@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, if (compr->decomp_mutex) mutex_lock(compr->decomp_mutex); err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, - out_len); + (unsigned int *)out_len); if (compr->decomp_mutex) mutex_unlock(compr->decomp_mutex); if (err) @@ -244,7 +244,7 @@ out_lzo: /** * ubifs_compressors_exit - de-initialize UBIFS compressors. */ -void __exit ubifs_compressors_exit(void) +void ubifs_compressors_exit(void) { compr_exit(&lzo_compr); compr_exit(&zlib_compr); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 510ffa0bbda4..792c5a16c182 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -32,6 +32,8 @@ #include "ubifs.h" #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/debugfs.h> +#include <linux/math64.h> #ifdef CONFIG_UBIFS_FS_DEBUG @@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c) struct rb_node *rb; struct ubifs_bud *bud; struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + ubifs_assert(spin_is_locked(&c->space_lock)); spin_lock(&dbg_lock); printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, @@ -629,6 +633,17 @@ void dbg_dump_budg(struct ubifs_info *c) printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", idx_gc->lnum, idx_gc->unmap); printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + printk(KERN_DEBUG "Budgeting predictions:\n"); + printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); spin_unlock(&dbg_lock); } @@ -645,7 +660,8 @@ void dbg_dump_lprops(struct ubifs_info *c) struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); + printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", + current->pid); ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); @@ -656,6 +672,8 @@ void dbg_dump_lprops(struct ubifs_info *c) dbg_dump_lprop(c, &lp); } + printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", + current->pid); } void dbg_dump_lpt_info(struct ubifs_info *c) @@ -663,6 +681,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c) int i; spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); @@ -684,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c) printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); printk(KERN_DEBUG "\tLPT head is at %d:%d\n", c->nhead_lnum, c->nhead_offs); - printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", + c->ltab_lnum, c->ltab_offs); if (c->big_lpt) printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", c->lsave_lnum, c->lsave_offs); @@ -703,9 +723,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) if (dbg_failure_mode) return; - printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); - - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); return; @@ -721,6 +741,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) dbg_dump_node(c, snod->node); } + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); ubifs_scan_destroy(sleb); return; } @@ -768,7 +790,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", + printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; @@ -777,6 +799,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) "flags %d\n", i, lprops->lnum, lprops->hpos, lprops->free, lprops->dirty, lprops->flags); } + printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); } void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, @@ -784,7 +807,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, { int i; - printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); + printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", @@ -803,7 +826,7 @@ void dbg_dump_tnc(struct ubifs_info *c) int level; printk(KERN_DEBUG "\n"); - printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); + printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; printk(KERN_DEBUG "== Level %d ==\n", level); @@ -815,8 +838,7 @@ void dbg_dump_tnc(struct ubifs_info *c) dbg_dump_znode(c, znode); znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); } - - printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); } static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, @@ -992,8 +1014,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, zbr1->offs, DBGKEY(&key)); dbg_err("but it should have key %s according to tnc", DBGKEY(&zbr1->key)); - dbg_dump_node(c, dent1); - goto out_free; + dbg_dump_node(c, dent1); + goto out_free; } key_read(c, &dent2->key, &key); @@ -1002,8 +1024,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, zbr1->offs, DBGKEY(&key)); dbg_err("but it should have key %s according to tnc", DBGKEY(&zbr2->key)); - dbg_dump_node(c, dent2); - goto out_free; + dbg_dump_node(c, dent2); + goto out_free; } nlen1 = le16_to_cpu(dent1->nlen); @@ -1020,9 +1042,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, dbg_err("bad order of colliding key %s", DBGKEY(&key)); - dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); + ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); dbg_dump_node(c, dent1); - dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); + ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); dbg_dump_node(c, dent2); out_free: @@ -2097,13 +2119,13 @@ static int simple_rand(void) return (next >> 16) & 32767; } -void dbg_failure_mode_registration(struct ubifs_info *c) +static void failure_mode_init(struct ubifs_info *c) { struct failure_mode_info *fmi; fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); if (!fmi) { - dbg_err("Failed to register failure mode - no memory"); + ubifs_err("Failed to register failure mode - no memory"); return; } fmi->c = c; @@ -2112,7 +2134,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c) spin_unlock(&fmi_lock); } -void dbg_failure_mode_deregistration(struct ubifs_info *c) +static void failure_mode_exit(struct ubifs_info *c) { struct failure_mode_info *fmi, *tmp; @@ -2146,42 +2168,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc) struct ubifs_info *c = dbg_find_info(desc); if (c && dbg_failure_mode) - return c->failure_mode; + return c->dbg->failure_mode; return 0; } static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) { struct ubifs_info *c = dbg_find_info(desc); + struct ubifs_debug_info *d; if (!c || !dbg_failure_mode) return 0; - if (c->failure_mode) + d = c->dbg; + if (d->failure_mode) return 1; - if (!c->fail_cnt) { + if (!d->fail_cnt) { /* First call - decide delay to failure */ if (chance(1, 2)) { unsigned int delay = 1 << (simple_rand() >> 11); if (chance(1, 2)) { - c->fail_delay = 1; - c->fail_timeout = jiffies + + d->fail_delay = 1; + d->fail_timeout = jiffies + msecs_to_jiffies(delay); dbg_rcvry("failing after %ums", delay); } else { - c->fail_delay = 2; - c->fail_cnt_max = delay; + d->fail_delay = 2; + d->fail_cnt_max = delay; dbg_rcvry("failing after %u calls", delay); } } - c->fail_cnt += 1; + d->fail_cnt += 1; } /* Determine if failure delay has expired */ - if (c->fail_delay == 1) { - if (time_before(jiffies, c->fail_timeout)) + if (d->fail_delay == 1) { + if (time_before(jiffies, d->fail_timeout)) return 0; - } else if (c->fail_delay == 2) - if (c->fail_cnt++ < c->fail_cnt_max) + } else if (d->fail_delay == 2) + if (d->fail_cnt++ < d->fail_cnt_max) return 0; if (lnum == UBIFS_SB_LNUM) { if (write) { @@ -2239,7 +2263,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) dbg_rcvry("failing in bud LEB %d commit not running", lnum); } ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); - c->failure_mode = 1; + d->failure_mode = 1; dump_stack(); return 1; } @@ -2344,4 +2368,181 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) return 0; } +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + c->dbg->buf = vmalloc(c->leb_size); + if (!c->dbg->buf) + goto out; + + failure_mode_init(c); + return 0; + +out: + kfree(c->dbg); + return -ENOMEM; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + failure_mode_exit(c); + vfree(c->dbg->buf); + kfree(c->dbg); +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *debugfs_rootdir; + +/** + * dbg_debugfs_init - initialize debugfs file-system. + * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + debugfs_rootdir = debugfs_create_dir("ubifs", NULL); + if (IS_ERR(debugfs_rootdir)) { + int err = PTR_ERR(debugfs_rootdir); + ubifs_err("cannot create \"ubifs\" debugfs directory, " + "error %d\n", err); + return err; + } + + return 0; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. + */ +void dbg_debugfs_exit(void) +{ + debugfs_remove(debugfs_rootdir); +} + +static int open_debugfs_file(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t write_debugfs_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + + if (file->f_path.dentry == d->dump_lprops) + dbg_dump_lprops(c); + else if (file->f_path.dentry == d->dump_budg) { + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + } else if (file->f_path.dentry == d->dump_tnc) { + mutex_lock(&c->tnc_mutex); + dbg_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + } else + return -EINVAL; + + *ppos += count; + return count; +} + +static const struct file_operations debugfs_fops = { + .open = open_debugfs_file, + .write = write_debugfs_file, + .owner = THIS_MODULE, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. + * + * Note, the only reason we have not merged this function with the + * 'ubifs_debugging_init()' function is because it is better to initialize + * debugfs interfaces at the very end of the mount process, and remove them at + * the very beginning of the mount process. + */ +int dbg_debugfs_init_fs(struct ubifs_info *c) +{ + int err; + const char *fname; + struct dentry *dent; + struct ubifs_debug_info *d = c->dbg; + + sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); + d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name, + debugfs_rootdir); + if (IS_ERR(d->debugfs_dir)) { + err = PTR_ERR(d->debugfs_dir); + ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + d->debugfs_dir_name, err); + goto out; + } + + fname = "dump_lprops"; + dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, + &debugfs_fops); + if (IS_ERR(dent)) + goto out_remove; + d->dump_lprops = dent; + + fname = "dump_budg"; + dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, + &debugfs_fops); + if (IS_ERR(dent)) + goto out_remove; + d->dump_budg = dent; + + fname = "dump_tnc"; + dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c, + &debugfs_fops); + if (IS_ERR(dent)) + goto out_remove; + d->dump_tnc = dent; + + return 0; + +out_remove: + err = PTR_ERR(dent); + ubifs_err("cannot create \"%s\" debugfs directory, error %d\n", + fname, err); + debugfs_remove_recursive(d->debugfs_dir); +out: + return err; +} + +/** + * dbg_debugfs_exit_fs - remove all debugfs files. + * @c: UBIFS file-system description object + */ +void dbg_debugfs_exit_fs(struct ubifs_info *c) +{ + debugfs_remove_recursive(c->dbg->debugfs_dir); +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 33d6b95071e4..9820d6999f7e 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -25,7 +25,56 @@ #ifdef CONFIG_UBIFS_FS_DEBUG -#define UBIFS_DBG(op) op +/** + * ubifs_debug_info - per-FS debugging information. + * @buf: a buffer of LEB size, used for various purposes + * @old_zroot: old index root - used by 'dbg_check_old_index()' + * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' + * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' + * @failure_mode: failure mode for recovery testing + * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls + * @fail_timeout: time in jiffies when delay of failure mode expires + * @fail_cnt: current number of calls to failure mode I/O functions + * @fail_cnt_max: number of calls by which to delay failure mode + * @chk_lpt_sz: used by LPT tree size checker + * @chk_lpt_sz2: used by LPT tree size checker + * @chk_lpt_wastage: used by LPT tree size checker + * @chk_lpt_lebs: used by LPT tree size checker + * @new_nhead_offs: used by LPT tree size checker + * @new_ihead_lnum: used by debugging to check ihead_lnum + * @new_ihead_offs: used by debugging to check ihead_offs + * + * debugfs_dir_name: name of debugfs directory containing this file-system's + * files + * debugfs_dir: direntry object of the file-system debugfs directory + * dump_lprops: "dump lprops" debugfs knob + * dump_budg: "dump budgeting information" debugfs knob + * dump_tnc: "dump TNC" debugfs knob + */ +struct ubifs_debug_info { + void *buf; + struct ubifs_zbranch old_zroot; + int old_zroot_level; + unsigned long long old_zroot_sqnum; + int failure_mode; + int fail_delay; + unsigned long fail_timeout; + unsigned int fail_cnt; + unsigned int fail_cnt_max; + long long chk_lpt_sz; + long long chk_lpt_sz2; + long long chk_lpt_wastage; + int chk_lpt_lebs; + int new_nhead_offs; + int new_ihead_lnum; + int new_ihead_offs; + + char debugfs_dir_name[100]; + struct dentry *debugfs_dir; + struct dentry *dump_lprops; + struct dentry *dump_budg; + struct dentry *dump_tnc; +}; #define ubifs_assert(expr) do { \ if (unlikely(!(expr))) { \ @@ -211,14 +260,18 @@ extern unsigned int ubifs_msg_flags; extern unsigned int ubifs_chk_flags; extern unsigned int ubifs_tst_flags; -/* Dump functions */ +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); +/* Dump functions */ const char *dbg_ntype(int type); const char *dbg_cstate(int cmt_state); const char *dbg_get_key_dump(const struct ubifs_info *c, const union ubifs_key *key); void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, + int offs); void dbg_dump_budget_req(const struct ubifs_budget_req *req); void dbg_dump_lstats(const struct ubifs_lp_stats *lst); void dbg_dump_budg(struct ubifs_info *c); @@ -233,9 +286,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, struct ubifs_nnode *parent, int iip); void dbg_dump_tnc(struct ubifs_info *c); void dbg_dump_index(struct ubifs_info *c); +void dbg_dump_lpt_lebs(const struct ubifs_info *c); /* Checking helper functions */ - typedef int (*dbg_leaf_callback)(struct ubifs_info *c, struct ubifs_zbranch *zbr, void *priv); typedef int (*dbg_znode_callback)(struct ubifs_info *c, @@ -274,9 +327,6 @@ int dbg_force_in_the_gaps(void); #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) -void dbg_failure_mode_registration(struct ubifs_info *c); -void dbg_failure_mode_deregistration(struct ubifs_info *c); - #ifndef UBIFS_DBG_PRESERVE_UBI #define ubi_leb_read dbg_leb_read @@ -318,9 +368,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); } -#else /* !CONFIG_UBIFS_FS_DEBUG */ +/* Debugfs-related stuff */ +int dbg_debugfs_init(void); +void dbg_debugfs_exit(void); +int dbg_debugfs_init_fs(struct ubifs_info *c); +void dbg_debugfs_exit_fs(struct ubifs_info *c); -#define UBIFS_DBG(op) +#else /* !CONFIG_UBIFS_FS_DEBUG */ /* Use "if (0)" to make compiler check arguments even if debugging is off */ #define ubifs_assert(expr) do { \ @@ -360,23 +414,28 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) -#define dbg_ntype(type) "" -#define dbg_cstate(cmt_state) "" -#define dbg_get_key_dump(c, key) ({}) -#define dbg_dump_inode(c, inode) ({}) -#define dbg_dump_node(c, node) ({}) -#define dbg_dump_budget_req(req) ({}) -#define dbg_dump_lstats(lst) ({}) -#define dbg_dump_budg(c) ({}) -#define dbg_dump_lprop(c, lp) ({}) -#define dbg_dump_lprops(c) ({}) -#define dbg_dump_lpt_info(c) ({}) -#define dbg_dump_leb(c, lnum) ({}) -#define dbg_dump_znode(c, znode) ({}) -#define dbg_dump_heap(c, heap, cat) ({}) -#define dbg_dump_pnode(c, pnode, parent, iip) ({}) -#define dbg_dump_tnc(c) ({}) -#define dbg_dump_index(c) ({}) +#define ubifs_debugging_init(c) 0 +#define ubifs_debugging_exit(c) ({}) + +#define dbg_ntype(type) "" +#define dbg_cstate(cmt_state) "" +#define dbg_get_key_dump(c, key) ({}) +#define dbg_dump_inode(c, inode) ({}) +#define dbg_dump_node(c, node) ({}) +#define dbg_dump_lpt_node(c, node, lnum, offs) ({}) +#define dbg_dump_budget_req(req) ({}) +#define dbg_dump_lstats(lst) ({}) +#define dbg_dump_budg(c) ({}) +#define dbg_dump_lprop(c, lp) ({}) +#define dbg_dump_lprops(c) ({}) +#define dbg_dump_lpt_info(c) ({}) +#define dbg_dump_leb(c, lnum) ({}) +#define dbg_dump_znode(c, znode) ({}) +#define dbg_dump_heap(c, heap, cat) ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c) ({}) +#define dbg_dump_index(c) ({}) +#define dbg_dump_lpt_lebs(c) ({}) #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 #define dbg_old_index_check_init(c, zroot) 0 @@ -396,9 +455,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #define dbg_force_in_the_gaps_enabled 0 #define dbg_force_in_the_gaps() 0 #define dbg_failure_mode 0 -#define dbg_failure_mode_registration(c) ({}) -#define dbg_failure_mode_deregistration(c) ({}) -#endif /* !CONFIG_UBIFS_FS_DEBUG */ +#define dbg_debugfs_init() 0 +#define dbg_debugfs_exit() +#define dbg_debugfs_init_fs(c) 0 +#define dbg_debugfs_exit_fs(c) 0 +#endif /* !CONFIG_UBIFS_FS_DEBUG */ #endif /* !__UBIFS_DEBUG_H__ */ diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2624411d9758..bf37374567fa 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, return err; } - ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum); - + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); len = le32_to_cpu(dn->size); if (len <= 0 || len > UBIFS_BLOCK_SIZE) goto dump; @@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) { ubifs_release_budget(c, &req); return -ENOMEM; } if (!PageUptodate(page)) { - if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) SetPageChecked(page); else { err = do_readpage(page); @@ -438,13 +439,13 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) return -ENOMEM; if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further @@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, pagep, flags); } /* diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 0bef6501d58a..9832f9abe28e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -45,7 +45,7 @@ #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ /* - * GC may need to move more then one LEB to make progress. The below constants + * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. */ diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 5e82cffe9695..6db7a6be6c97 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFLAGS: flags = ubifs2ioctl(ubifs_inode(inode)->flags); + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); return put_user(flags, (int __user *) arg); case FS_IOC_SETFLAGS: { @@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) err = mnt_want_write(file->f_path.mnt); if (err) return err; + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); err = setflags(inode, flags); mnt_drop_write(file->f_path.mnt); return err; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index f91b745908ea..9b7c54e0cd2a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -191,7 +191,7 @@ again: if (wbuf->lnum != -1 && avail >= len) { /* * Someone else has switched the journal head and we have - * enough space now. This happens when more then one process is + * enough space now. This happens when more than one process is * trying to write to the same journal head at the same time. */ dbg_jnl("return LEB %d back, already have LEB %d:%d", @@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, data->size = cpu_to_le32(len); zero_data_node_unused(data); - if (!(ui->flags && UBIFS_COMPR_FL)) + if (!(ui->flags & UBIFS_COMPR_FL)) /* Compression is disabled for this inode */ compr_type = UBIFS_COMPR_NONE; else @@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, data_key_init(c, &key, inum, blk); bit = old_size & (UBIFS_BLOCK_SIZE - 1); - blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); + blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1); data_key_init(c, &to_key, inum, blk); err = ubifs_tnc_remove_range(c, &key, &to_key); diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 3f1f16bc25c9..efb3430a2581 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -38,6 +38,22 @@ #define __UBIFS_KEY_H__ /** + * key_mask_hash - mask a valid hash value. + * @val: value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. + */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** * key_r5_hash - R5 hash function (borrowed from reiserfs). * @s: direntry name * @len: name length @@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len) str++; } - a &= UBIFS_S_KEY_HASH_MASK; - - /* - * We use hash values as offset in directories, so values %0 and %1 are - * reserved for "." and "..". %2 is reserved for "end of readdir" - * marker. - */ - if (unlikely(a >= 0 && a <= 2)) - a += 3; - return a; + return key_mask_hash(a); } /** @@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len) len = min_t(uint32_t, len, 4); memcpy(&a, str, len); - a &= UBIFS_S_KEY_HASH_MASK; - if (unlikely(a >= 0 && a <= 2)) - a += 3; - return a; + return key_mask_hash(a); } /** diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index f27176e9b70d..dfd2bcece27a 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) * @flags: new flags * @idx_gc_cnt: change to the count of idx_gc list * - * This function changes LEB properties. This function does not change a LEB - * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. + * This function changes LEB properties (@free, @dirty or @flag). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and a negative error code on failure. * - * This function returns a pointer to the updated LEB properties on success - * and a negative error code on failure. N.B. the LEB properties may have had to - * be copied (due to COW) and consequently the pointer returned may not be the - * same as the pointer passed. + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. */ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, const struct ubifs_lprops *lp, @@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c, } } - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); if (IS_ERR(sleb)) { /* * After an unclean unmount, empty and freeable LEBs diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index db8bd0e518b2..b2792e84d245 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -36,15 +36,16 @@ * can be written into a single eraseblock. In that case, garbage collection * consists of just writing the whole table, which therefore makes all other * eraseblocks reusable. In the case of the big model, dirty eraseblocks are - * selected for garbage collection, which consists are marking the nodes in + * selected for garbage collection, which consists of marking the clean nodes in * that LEB as dirty, and then only the dirty nodes are written out. Also, in * the case of the big model, a table of LEB numbers is saved so that the entire * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first * mounted. */ -#include <linux/crc16.h> #include "ubifs.h" +#include <linux/crc16.h> +#include <linux/math64.h> /** * do_calc_lpt_geom - calculate sizes for the LPT area. @@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c) int ubifs_calc_lpt_geom(struct ubifs_info *c) { int lebs_needed; - uint64_t sz; + long long sz; do_calc_lpt_geom(c); /* Verify that lpt_lebs is big enough */ sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ - sz += c->leb_size - 1; - do_div(sz, c->leb_size); - lebs_needed = sz; + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); if (lebs_needed > c->lpt_lebs) { ubifs_err("too few LPT LEBs"); return -EINVAL; @@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c) } c->check_lpt_free = c->big_lpt; - return 0; } @@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, int *big_lpt) { int i, lebs_needed; - uint64_t sz; + long long sz; /* Start by assuming the minimum number of LPT LEBs */ c->lpt_lebs = UBIFS_MIN_LPT_LEBS; @@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, /* Now check there are enough LPT LEBs */ for (i = 0; i < 64 ; i++) { sz = c->lpt_sz * 4; /* Allow 4 times the size */ - sz += c->leb_size - 1; - do_div(sz, c->leb_size); - lebs_needed = sz; + lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); if (lebs_needed > c->lpt_lebs) { /* Not enough LPT LEBs so try again with more */ c->lpt_lebs = lebs_needed; @@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col) * This function calculates and returns the nnode number based on the parent's * nnode number and the index in parent. */ -static int calc_nnode_num_from_parent(struct ubifs_info *c, +static int calc_nnode_num_from_parent(const struct ubifs_info *c, struct ubifs_nnode *parent, int iip) { int num, shft; @@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c, * This function calculates and returns the pnode number based on the parent's * nnode number and the index in parent. */ -static int calc_pnode_num_from_parent(struct ubifs_info *c, +static int calc_pnode_num_from_parent(const struct ubifs_info *c, struct ubifs_nnode *parent, int iip) { int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; @@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type) * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_pnode(struct ubifs_info *c, void *buf, +static int unpack_pnode(const struct ubifs_info *c, void *buf, struct ubifs_pnode *pnode) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; @@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf, } /** - * unpack_nnode - unpack a nnode. + * ubifs_unpack_nnode - unpack a nnode. * @c: UBIFS file-system description object * @buf: buffer containing packed nnode to unpack * @nnode: nnode structure to fill * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_nnode(struct ubifs_info *c, void *buf, - struct ubifs_nnode *nnode) +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf, * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_ltab(struct ubifs_info *c, void *buf) +static int unpack_ltab(const struct ubifs_info *c, void *buf) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf) * * This function returns %0 on success and a negative error code on failure. */ -static int unpack_lsave(struct ubifs_info *c, void *buf) +static int unpack_lsave(const struct ubifs_info *c, void *buf) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; @@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf) * * This function returns %0 on success and a negative error code on failure. */ -static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, +static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode, struct ubifs_nnode *parent, int iip) { int i, lvl, max_offs; @@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, * * This function returns %0 on success and a negative error code on failure. */ -static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, +static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode, struct ubifs_nnode *parent, int iip) { int i; @@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, * This function calculates the LEB numbers for the LEB properties it contains * based on the pnode number. */ -static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) +static void set_pnode_lnum(const struct ubifs_info *c, + struct ubifs_pnode *pnode) { int i, lnum; @@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); if (err) goto out; - err = unpack_nnode(c, buf, nnode); + err = ubifs_unpack_nnode(c, buf, nnode); if (err) goto out; } @@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c, c->nnode_sz); if (err) return ERR_PTR(err); - err = unpack_nnode(c, buf, nnode); + err = ubifs_unpack_nnode(c, buf, nnode); if (err) return ERR_PTR(err); } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index a41434b42785..96ca95707175 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -320,6 +320,8 @@ no_space: dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); return err; } @@ -546,8 +548,10 @@ static int write_cnodes(struct ubifs_info *c) no_space: ubifs_err("LPT out of space mismatch"); dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " - "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); + "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); return err; } @@ -749,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c) * LPT trivial garbage collection is where a LPT LEB contains only dirty and * free space and so may be reused as soon as the next commit is completed. * This function is called after the commit is completed (master node has been - * written) and unmaps LPT LEBs that were marked for trivial GC. + * written) and un-maps LPT LEBs that were marked for trivial GC. */ static int lpt_tgc_end(struct ubifs_info *c) { @@ -1025,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, * @c: UBIFS file-system description object * @node_type: LPT node type */ -static int get_lpt_node_len(struct ubifs_info *c, int node_type) +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) { switch (node_type) { case UBIFS_LPT_NNODE: @@ -1046,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type) * @buf: buffer * @len: length of buffer */ -static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) { int offs, pad_len; @@ -1063,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) * @buf: buffer * @node_num: node number is returned here */ -static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int pos = 0, node_type; @@ -1081,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) * * This function returns %1 if the buffer contains a node or %0 if it does not. */ -static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) { uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int pos = 0, node_type, node_len; @@ -1105,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) return 1; } - /** * lpt_gc_lnum - garbage collect a LPT LEB. * @c: UBIFS file-system description object @@ -1463,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only) #ifdef CONFIG_UBIFS_FS_DEBUG /** - * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. * @buf: buffer * @len: buffer length */ @@ -1488,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) struct ubifs_nnode *nnode; int hght; - /* Entire tree is in memory so first_nnode / next_nnode are ok */ + /* Entire tree is in memory so first_nnode / next_nnode are OK */ nnode = first_nnode(c, &hght); for (; nnode; nnode = next_nnode(c, nnode, &hght)) { struct ubifs_nbranch *branch; @@ -1602,7 +1606,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) { int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; int ret; - void *buf = c->dbg_buf; + void *buf = c->dbg->buf; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; dbg_lp("LEB %d", lnum); err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); @@ -1704,6 +1711,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) long long free = 0; int i; + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + for (i = 0; i < c->lpt_lebs; i++) { if (c->ltab[i].tgc || c->ltab[i].cmt) continue; @@ -1716,6 +1726,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) dbg_err("LPT space error: free %lld lpt_sz %lld", free, c->lpt_sz); dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); return -EINVAL; } return 0; @@ -1731,15 +1743,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) */ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { + struct ubifs_debug_info *d = c->dbg; long long chk_lpt_sz, lpt_sz; int err = 0; + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + switch (action) { case 0: - c->chk_lpt_sz = 0; - c->chk_lpt_sz2 = 0; - c->chk_lpt_lebs = 0; - c->chk_lpt_wastage = 0; + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; if (c->dirty_pn_cnt > c->pnode_cnt) { dbg_err("dirty pnodes %d exceed max %d", c->dirty_pn_cnt, c->pnode_cnt); @@ -1752,35 +1768,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) } return err; case 1: - c->chk_lpt_sz += len; + d->chk_lpt_sz += len; return 0; case 2: - c->chk_lpt_sz += len; - c->chk_lpt_wastage += len; - c->chk_lpt_lebs += 1; + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; return 0; case 3: chk_lpt_sz = c->leb_size; - chk_lpt_sz *= c->chk_lpt_lebs; + chk_lpt_sz *= d->chk_lpt_lebs; chk_lpt_sz += len - c->nhead_offs; - if (c->chk_lpt_sz != chk_lpt_sz) { + if (d->chk_lpt_sz != chk_lpt_sz) { dbg_err("LPT wrote %lld but space used was %lld", - c->chk_lpt_sz, chk_lpt_sz); + d->chk_lpt_sz, chk_lpt_sz); err = -EINVAL; } - if (c->chk_lpt_sz > c->lpt_sz) { + if (d->chk_lpt_sz > c->lpt_sz) { dbg_err("LPT wrote %lld but lpt_sz is %lld", - c->chk_lpt_sz, c->lpt_sz); + d->chk_lpt_sz, c->lpt_sz); err = -EINVAL; } - if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { dbg_err("LPT layout size %lld but wrote %lld", - c->chk_lpt_sz, c->chk_lpt_sz2); + d->chk_lpt_sz, d->chk_lpt_sz2); err = -EINVAL; } - if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { dbg_err("LPT new nhead offs: expected %d was %d", - c->new_nhead_offs, len); + d->new_nhead_offs, len); err = -EINVAL; } lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; @@ -1788,26 +1804,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) lpt_sz += c->ltab_sz; if (c->big_lpt) lpt_sz += c->lsave_sz; - if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { + if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", - c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); err = -EINVAL; } - if (err) + if (err) { dbg_dump_lpt_info(c); - c->chk_lpt_sz2 = c->chk_lpt_sz; - c->chk_lpt_sz = 0; - c->chk_lpt_wastage = 0; - c->chk_lpt_lebs = 0; - c->new_nhead_offs = len; + dbg_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; return err; case 4: - c->chk_lpt_sz += len; - c->chk_lpt_wastage += len; + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; return 0; default: return -EINVAL; } } +/** + * dbg_dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. + */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->dbg->buf; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return; + } + while (1) { + offs = c->leb_size - len; + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + buf += pad_len; + len -= pad_len; + continue; + } + if (len) + printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, buf, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, pnode\n", + lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { + int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, buf, &nnode); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + printk("%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + printk(", "); + } + printk("\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + printk(KERN_DEBUG "LEB %d:%d, ltab\n", + lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err("LPT node type %d not recognized", node_type); + return; + } + + buf += node_len; + len -= node_len; + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); +} + +/** + * dbg_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void dbg_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", + current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", + current->pid); +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 9bd5a43d4526..9e6f403f170e 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 21f7d047c306..ce42a7b0ca5a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) /* * If the replay order was perfect the dirty space would now be * zero. The order is not perfect because the the journal heads - * race with eachother. This is not a problem but is does mean + * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. */ @@ -656,7 +656,7 @@ out_dump: * @dirty: amount of dirty space from padding and deletion nodes * * This function inserts a reference node to the replay tree and returns zero - * in case of success ort a negative error code in case of failure. + * in case of success or a negative error code in case of failure. */ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, unsigned long long sqnum, int free, int dirty) @@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * This means that we reached end of log and now * look to the older log data, which was already * committed but the eraseblock was not erased (UBIFS - * only unmaps it). So this basically means we have to + * only un-maps it). So this basically means we have to * exit with "end of log" code. */ err = 1; @@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c) if (err) goto out; + /* + * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable + * to roughly estimate index growth. Things like @c->min_idx_lebs + * depend on it. This means we have to initialize it to make sure + * budgeting works properly. + */ + c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->budg_uncommitted_idx *= c->max_idx_node_sz; + ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 0f392351dc5a..e070c643d1bb 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,6 +28,7 @@ #include "ubifs.h" #include <linux/random.h> +#include <linux/math64.h> /* * Default journal size in logical eraseblocks as a percent of total @@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c) int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; int min_leb_cnt = UBIFS_MIN_LEB_CNT; - uint64_t tmp64, main_bytes; + long long tmp64, main_bytes; __le64 tmp_le64; /* Some functions called from here depend on the @c->key_len filed */ @@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (!sup) return -ENOMEM; - tmp64 = (uint64_t)max_buds * c->leb_size; + tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; @@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c) sup->fanout = cpu_to_le32(DEFAULT_FANOUT); sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); - sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); generate_random_uuid(sup->uuid); - main_bytes = (uint64_t)main_lebs * c->leb_size; - tmp64 = main_bytes * DEFAULT_RP_PERCENT; - do_div(tmp64, 100); + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); @@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c) c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; c->fanout = le32_to_cpu(sup->fanout); c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); - c->default_compr = le16_to_cpu(sup->default_compr); c->rp_size = le64_to_cpu(sup->rp_size); c->rp_uid = le32_to_cpu(sup->rp_uid); c->rp_gid = le32_to_cpu(sup->rp_gid); sup_flags = le32_to_cpu(sup->flags); + if (!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); - memcpy(&c->uuid, &sup->uuid, 16); - c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); /* Automatically increase file system size to the maximum size */ diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a2..e7bab52a1410 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean - * znodes which are older then @age, until at least @nr znodes are freed. + * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index d80b2aef42b6..89556ee72518 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -34,6 +34,8 @@ #include <linux/parser.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/math64.h> +#include <linux/writeback.h> #include "ubifs.h" /* @@ -417,39 +419,61 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) else if (c->mount_opts.chk_data_crc == 1) seq_printf(s, ",no_chk_data_crc"); + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr="); + seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); + } + return 0; } static int ubifs_sync_fs(struct super_block *sb, int wait) { + int i, err; struct ubifs_info *c = sb->s_fs_info; - int i, ret = 0, err; - long long bud_bytes; + struct writeback_control wbc = { + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + .nr_to_write = LONG_MAX, + }; - if (c->jheads) { - for (i = 0; i < c->jhead_cnt; i++) { - err = ubifs_wbuf_sync(&c->jheads[i].wbuf); - if (err && !ret) - ret = err; - } + /* + * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an + * advisory thing to help the file system shove lots of data into the + * queues. If some gets missed then it'll be picked up on the second + * '->sync_fs()' call, with non-zero @wait. + */ - /* Commit the journal unless it has too little data */ - spin_lock(&c->buds_lock); - bud_bytes = c->bud_bytes; - spin_unlock(&c->buds_lock); - if (bud_bytes > c->leb_size) { - err = ubifs_run_commit(c); - if (err) - return err; - } + if (sb->s_flags & MS_RDONLY) + return 0; + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; } /* - * We ought to call sync for c->ubi but it does not have one. If it had - * it would in turn call mtd->sync, however mtd operations are - * synchronous anyway, so we don't lose any sleep here. + * VFS calls '->sync_fs()' before synchronizing all dirty inodes and + * pages, so synchronize them first, then commit the journal. Strictly + * speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. */ - return ret; + generic_sync_sb_inodes(sb, &wbc); + + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); } /** @@ -596,7 +620,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) } /* - * init_constants_late - initialize UBIFS constants. + * init_constants_sb - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after @@ -604,10 +628,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) * makes sure they are all right. Returns zero in case of success and a * negative error code in case of failure. */ -static int init_constants_late(struct ubifs_info *c) +static int init_constants_sb(struct ubifs_info *c) { int tmp, err; - uint64_t tmp64; + long long tmp64; c->main_bytes = (long long)c->main_lebs * c->leb_size; c->max_znode_sz = sizeof(struct ubifs_znode) + @@ -634,9 +658,8 @@ static int init_constants_late(struct ubifs_info *c) * Make sure that the log is large enough to fit reference nodes for * all buds plus one reserved LEB. */ - tmp64 = c->max_bud_bytes; - tmp = do_div(tmp64, c->leb_size); - c->max_bud_cnt = tmp64 + !!tmp; + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); tmp /= c->leb_size; tmp += 1; @@ -672,7 +695,7 @@ static int init_constants_late(struct ubifs_info *c) * Consequently, if the journal is too small, UBIFS will treat it as * always full. */ - tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; if (c->bg_bud_bytes < tmp64) c->bg_bud_bytes = tmp64; if (c->max_bud_bytes < tmp64 + c->leb_size) @@ -682,6 +705,21 @@ static int init_constants_late(struct ubifs_info *c) if (err) return err; + return 0; +} + +/* + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. It also checks various UBIFS parameters and + * makes sure they are all right. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); /* @@ -690,14 +728,13 @@ static int init_constants_late(struct ubifs_info *c) * necessary to report something for the 'statfs()' call. * * Subtract the LEB reserved for GC, the LEB which is reserved for - * deletions, and assume only one journal head is available. + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. */ - tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; - tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; - - return 0; } /** @@ -878,6 +915,7 @@ static int check_volume_empty(struct ubifs_info *c) * Opt_no_bulk_read: disable bulk-reads * Opt_chk_data_crc: check CRCs when reading data nodes * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor * Opt_err: just end of array marker */ enum { @@ -887,6 +925,7 @@ enum { Opt_no_bulk_read, Opt_chk_data_crc, Opt_no_chk_data_crc, + Opt_override_compr, Opt_err, }; @@ -897,6 +936,7 @@ static const match_table_t tokens = { {Opt_no_bulk_read, "no_bulk_read"}, {Opt_chk_data_crc, "chk_data_crc"}, {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, {Opt_err, NULL}, }; @@ -950,6 +990,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, c->mount_opts.chk_data_crc = 1; c->no_chk_data_crc = 1; break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err("unknown compressor \"%s\"", name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } default: ubifs_err("unrecognized mount option \"%s\" " "or missing value", p); @@ -1019,6 +1081,30 @@ again: } /** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err("insufficient free space to mount in read/write mode"); + dbg_dump_budg(c); + dbg_dump_lprops(c); + /* + * We return %-EINVAL instead of %-ENOSPC because it seems to + * be the closest error code mentioned in the mount function + * documentation. + */ + return -EINVAL; + } + return 0; +} + +/** * mount_ubifs - mount UBIFS file-system. * @c: UBIFS file-system description object * @@ -1039,11 +1125,9 @@ static int mount_ubifs(struct ubifs_info *c) if (err) return err; -#ifdef CONFIG_UBIFS_FS_DEBUG - c->dbg_buf = vmalloc(c->leb_size); - if (!c->dbg_buf) - return -ENOMEM; -#endif + err = ubifs_debugging_init(c); + if (err) + return err; err = check_volume_empty(c); if (err) @@ -1100,27 +1184,25 @@ static int mount_ubifs(struct ubifs_info *c) goto out_free; /* - * Make sure the compressor which is set as the default on in the - * superblock was actually compiled in. + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c->default_compr)) { - ubifs_warn("'%s' compressor is set by superblock, but not " - "compiled in", ubifs_compr_name(c->default_compr)); - c->default_compr = UBIFS_COMPR_NONE; + ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + goto out_free; } - dbg_failure_mode_registration(c); - - err = init_constants_late(c); + err = init_constants_sb(c); if (err) - goto out_dereg; + goto out_free; sz = ALIGN(c->max_idx_node_sz, c->min_io_size); sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; - goto out_dereg; + goto out_free; } sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); @@ -1145,6 +1227,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_master; + init_constants_master(c); + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg("recovery needed"); c->need_recovery = 1; @@ -1183,12 +1267,9 @@ static int mount_ubifs(struct ubifs_info *c) if (!mounted_read_only) { int lnum; - /* Check for enough free space */ - if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { - ubifs_err("insufficient available space"); - err = -EINVAL; + err = check_free_space(c); + if (err) goto out_orphans; - } /* Check for enough log space */ lnum = c->lhead_lnum + 1; @@ -1232,6 +1313,10 @@ static int mount_ubifs(struct ubifs_info *c) } } + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + err = dbg_check_filesystem(c); if (err) goto out_infos; @@ -1283,8 +1368,20 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("tree fanout: %d", c->fanout); dbg_msg("reserved GC LEB: %d", c->gc_lnum); dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("max. znode size %d", c->max_znode_sz); + dbg_msg("max. index node size %d", c->max_idx_node_sz); + dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); + dbg_msg("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); @@ -1320,14 +1417,12 @@ out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); -out_dereg: - dbg_failure_mode_deregistration(c); out_free: kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); - UBIFS_DBG(vfree(c->dbg_buf)); + ubifs_debugging_exit(c); return err; } @@ -1345,6 +1440,7 @@ static void ubifs_umount(struct ubifs_info *c) dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); + dbg_debugfs_exit_fs(c); spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); @@ -1364,8 +1460,7 @@ static void ubifs_umount(struct ubifs_info *c) vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); - UBIFS_DBG(vfree(c->dbg_buf)); - dbg_failure_mode_deregistration(c); + ubifs_debugging_exit(c); } /** @@ -1387,12 +1482,9 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->remounting_rw = 1; c->always_chk_crc = 1; - /* Check for enough free space */ - if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { - ubifs_err("insufficient available space"); - err = -EINVAL; + err = check_free_space(c); + if (err) goto out; - } if (c->old_leb_cnt != c->leb_cnt) { struct ubifs_sb_node *sup; @@ -1515,20 +1607,24 @@ out: * @c: UBIFS file-system description object * * This function is called during un-mounting and re-mounting, and it commits - * the journal unless the "fast unmount" mode is enabled. It also avoids - * committing the journal if it contains too few data. + * the journal unless the "fast unmount" mode is enabled. */ static void commit_on_unmount(struct ubifs_info *c) { - if (!c->fast_unmount) { - long long bud_bytes; + struct super_block *sb = c->vfs_sb; + long long bud_bytes; - spin_lock(&c->buds_lock); - bud_bytes = c->bud_bytes; - spin_unlock(&c->buds_lock); - if (bud_bytes > c->leb_size) - ubifs_run_commit(c); - } + /* + * This function is called before the background thread is stopped, so + * we may race with ongoing commit, which means we have to take + * @c->bud_lock to access @c->bud_bytes. + */ + spin_lock(&c->buds_lock); + bud_bytes = c->bud_bytes; + spin_unlock(&c->buds_lock); + + if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes) + ubifs_run_commit(c); } /** @@ -1849,7 +1945,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_iput; mutex_unlock(&c->umount_mutex); - return 0; out_iput: @@ -1955,7 +2050,7 @@ static void ubifs_kill_sb(struct super_block *sb) * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' * in order to be outside BKL. */ - if (sb->s_root && !(sb->s_flags & MS_RDONLY)) + if (sb->s_root) commit_on_unmount(c); /* The un-mount routine is actually done in put_super() */ generic_shutdown_super(sb); @@ -2021,6 +2116,14 @@ static int __init ubifs_init(void) BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. + */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ @@ -2049,11 +2152,17 @@ static int __init ubifs_init(void) err = ubifs_compressors_init(); if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) goto out_compr; return 0; out_compr: + ubifs_compressors_exit(); +out_shrinker: unregister_shrinker(&ubifs_shrinker_info); kmem_cache_destroy(ubifs_inode_slab); out_reg: @@ -2068,6 +2177,7 @@ static void __exit ubifs_exit(void) ubifs_assert(list_empty(&ubifs_infos)); ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + dbg_debugfs_exit(); ubifs_compressors_exit(); unregister_shrinker(&ubifs_shrinker_info); kmem_cache_destroy(ubifs_inode_slab); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 6eef5344a145..f7e36f545527 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, if (found) { /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, - znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } zbr = &znode->zbranch[n]; lnc_free(zbr); @@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } if (found == 1) { @@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } /* Remove all keys in range except the first */ diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8ac76b1c2d55..fde8d127c768 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c) } #ifdef CONFIG_UBIFS_FS_DEBUG - c->new_ihead_lnum = lnum; - c->new_ihead_offs = buf_offs; + c->dbg->new_ihead_lnum = lnum; + c->dbg->new_ihead_offs = buf_offs; #endif return 0; @@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) * budgeting subsystem to assume the index is already committed, * even though it is not. */ + ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); c->old_idx_sz = c->calc_idx_sz; c->budg_uncommitted_idx = 0; + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); mutex_unlock(&c->tnc_mutex); @@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c) } #ifdef CONFIG_UBIFS_FS_DEBUG - if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != c->dbg->new_ihead_offs) { ubifs_err("inconsistent ihead"); return -EINVAL; } diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0b378042a3a2..b25fc36cf72f 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -51,6 +51,13 @@ */ #define UBIFS_MIN_COMPR_LEN 128 +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS preferes to leave this data + * node uncompress, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + /* Root inode number */ #define UBIFS_ROOT_INO 1 diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 46b172560a06..fc2a4cc66d03 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -63,6 +63,14 @@ #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL #define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + /* Minimum amount of data UBIFS writes to the flash */ #define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) @@ -386,12 +394,12 @@ struct ubifs_inode { unsigned int dirty:1; unsigned int xattr:1; unsigned int bulk_read:1; + unsigned int compr_type:2; struct mutex ui_mutex; spinlock_t ui_lock; loff_t synced_i_size; loff_t ui_size; int flags; - int compr_type; pgoff_t last_page_read; pgoff_t read_in_a_row; int data_len; @@ -419,7 +427,7 @@ struct ubifs_unclean_leb { * * LPROPS_UNCAT: not categorized * LPROPS_DIRTY: dirty > 0, not index - * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index * LPROPS_FREE: free > 0, not empty, not index * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs * LPROPS_EMPTY: LEB is empty, not taken @@ -473,8 +481,8 @@ struct ubifs_lprops { struct ubifs_lpt_lprops { int free; int dirty; - unsigned tgc : 1; - unsigned cmt : 1; + unsigned tgc:1; + unsigned cmt:1; }; /** @@ -482,24 +490,26 @@ struct ubifs_lpt_lprops { * @empty_lebs: number of empty LEBs * @taken_empty_lebs: number of taken LEBs * @idx_lebs: number of indexing LEBs - * @total_free: total free space in bytes - * @total_dirty: total dirty space in bytes - * @total_used: total used space in bytes (includes only data LEBs) - * @total_dead: total dead space in bytes (includes only data LEBs) - * @total_dark: total dark space in bytes (includes only data LEBs) + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. * - * N.B. total_dirty and total_used are different to other total_* fields, - * because they account _all_ LEBs, not just data LEBs. + * @empty_lebs includes @taken_empty_lebs. * - * 'taken_empty_lebs' counts the LEBs that are in the transient state of having - * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed - * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used - * by itself (in which case 'unused_lebs' would be a better name). In the case - * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC, - * but unlike other empty LEBs that are 'taken', it may not be written straight - * away (i.e. before the next commit start or unmount), so either gc_lnum must - * be specially accounted for, or the current approach followed i.e. count it - * under 'taken_empty_lebs'. + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. */ struct ubifs_lp_stats { int empty_lebs; @@ -893,15 +903,25 @@ struct ubifs_orphan { /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) - * @bulk_read: enable bulk-reads - * @chk_data_crc: check CRCs when reading data nodes + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disabe, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) */ struct ubifs_mount_opts { unsigned int unmount_mode:2; unsigned int bulk_read:2; unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; }; +struct ubifs_debug_info; + /** * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). @@ -946,6 +966,7 @@ struct ubifs_mount_opts { * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -963,8 +984,6 @@ struct ubifs_mount_opts { * @ileb_nxt: next pre-allocated index LEBs * @old_idx: tree of index nodes obsoleted since the last commit start * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c - * @new_ihead_lnum: used by debugging to check ihead_lnum - * @new_ihead_offs: used by debugging to check ihead_offs * * @mst_node: master node * @mst_offs: offset of valid master node @@ -986,7 +1005,6 @@ struct ubifs_mount_opts { * @main_lebs: count of LEBs in the main area * @main_first: first LEB of the main area * @main_bytes: main area size in bytes - * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) * * @key_hash_type: type of the key hash * @key_hash: direntry key hash function @@ -1149,15 +1167,7 @@ struct ubifs_mount_opts { * @always_chk_crc: always check CRCs (while mounting and remounting rw) * @mount_opts: UBIFS-specific mount options * - * @dbg_buf: a buffer of LEB size used for debugging purposes - * @old_zroot: old index root - used by 'dbg_check_old_index()' - * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' - * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' - * @failure_mode: failure mode for recovery testing - * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls - * @fail_timeout: time in jiffies when delay of failure mode expires - * @fail_cnt: current number of calls to failure mode I/O functions - * @fail_cnt_max: number of calls by which to delay failure mode + * @dbg: debugging-related information */ struct ubifs_info { struct super_block *vfs_sb; @@ -1196,6 +1206,7 @@ struct ubifs_info { unsigned int big_lpt:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; + unsigned int default_compr:2; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1212,10 +1223,6 @@ struct ubifs_info { int ileb_nxt; struct rb_root old_idx; int *bottom_up_buf; -#ifdef CONFIG_UBIFS_FS_DEBUG - int new_ihead_lnum; - int new_ihead_offs; -#endif struct ubifs_mst_node *mst_node; int mst_offs; @@ -1237,7 +1244,6 @@ struct ubifs_info { int main_lebs; int main_first; long long main_bytes; - int default_compr; uint8_t key_hash_type; uint32_t (*key_hash)(const char *str, int len); @@ -1315,8 +1321,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; - volatile int gc_seq; - volatile int gced_lnum; + int gc_seq; + int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1391,21 +1397,7 @@ struct ubifs_info { struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG - void *dbg_buf; - struct ubifs_zbranch old_zroot; - int old_zroot_level; - unsigned long long old_zroot_sqnum; - int failure_mode; - int fail_delay; - unsigned long fail_timeout; - unsigned int fail_cnt; - unsigned int fail_cnt_max; - long long chk_lpt_sz; - long long chk_lpt_sz2; - long long chk_lpt_wastage; - int chk_lpt_lebs; - int new_nhead_lnum; - int new_nhead_offs; + struct ubifs_debug_info *dbg; #endif }; @@ -1505,7 +1497,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, long long ubifs_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); -long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ @@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); /* lpt_commit.c */ int ubifs_lpt_start_commit(struct ubifs_info *c); @@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* compressor.c */ int __init ubifs_compressors_init(void); -void __exit ubifs_compressors_exit(void); +void ubifs_compressors_exit(void); void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type); int ubifs_decompress(const void *buf, int len, void *out, int *out_len, diff --git a/fs/xattr.c b/fs/xattr.c index 468377e66531..237804cd6b56 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) if (error) return error; error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { + if (d->d_inode->i_op->listxattr) { error = d->d_inode->i_op->listxattr(d, list, size); } else { error = security_inode_listsecurity(d->d_inode, list, size); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 737c9a425361..c3dc491fff89 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \ xfs_trans_inode.o \ xfs_trans_item.o \ xfs_utils.o \ - xfs_vfsops.o \ xfs_vnodeops.o \ xfs_rw.o \ xfs_dmops.o \ xfs_qmops.o -xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o +xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \ + xfs_dir2_trace.o # Objects in linux/ xfs-y += $(addprefix $(XFS_LINUX)/, \ @@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_iops.o \ xfs_lrw.o \ xfs_super.o \ - xfs_vnode.o \ + xfs_sync.o \ xfs_xattr.o) # Objects in support/ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h index 351a8f454bd1..4dfc7c370819 100644 --- a/fs/xfs/linux-2.6/sv.h +++ b/fs/xfs/linux-2.6/sv.h @@ -32,23 +32,15 @@ typedef struct sv_s { wait_queue_head_t waiters; } sv_t; -#define SV_FIFO 0x0 /* sv_t is FIFO type */ -#define SV_LIFO 0x2 /* sv_t is LIFO type */ -#define SV_PRIO 0x4 /* sv_t is PRIO type */ -#define SV_KEYED 0x6 /* sv_t is KEYED type */ -#define SV_DEFAULT SV_FIFO - - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, - unsigned long timeout) +static inline void _sv_wait(sv_t *sv, spinlock_t *lock) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(state); + __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(lock); - schedule_timeout(timeout); + schedule(); remove_wait_queue(&sv->waiters, &wait); } @@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, #define sv_destroy(sv) \ /*NOTHING*/ #define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_wait_sig(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) -#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) + _sv_wait(sv, lock) #define sv_signal(sv) \ wake_up(&(sv)->waiters) #define sv_broadcast(sv) \ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index a44d68eb50b5..de3a198f771e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -42,6 +42,40 @@ #include <linux/pagevec.h> #include <linux/writeback.h> + +/* + * Prime number of hash buckets since address is used as the key. + */ +#define NVSYNC 37 +#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC]) +static wait_queue_head_t xfs_ioend_wq[NVSYNC]; + +void __init +xfs_ioend_init(void) +{ + int i; + + for (i = 0; i < NVSYNC; i++) + init_waitqueue_head(&xfs_ioend_wq[i]); +} + +void +xfs_ioend_wait( + xfs_inode_t *ip) +{ + wait_queue_head_t *wq = to_ioend_wq(ip); + + wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); +} + +STATIC void +xfs_ioend_wake( + xfs_inode_t *ip) +{ + if (atomic_dec_and_test(&ip->i_iocount)) + wake_up(to_ioend_wq(ip)); +} + STATIC void xfs_count_page_state( struct page *page, @@ -146,16 +180,25 @@ xfs_destroy_ioend( xfs_ioend_t *ioend) { struct buffer_head *bh, *next; + struct xfs_inode *ip = XFS_I(ioend->io_inode); for (bh = ioend->io_buffer_head; bh; bh = next) { next = bh->b_private; bh->b_end_io(bh, !ioend->io_error); } - if (unlikely(ioend->io_error)) { - vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, - __FILE__,__LINE__); + + /* + * Volume managers supporting multiple paths can send back ENODEV + * when the final path disappears. In this case continuing to fill + * the page cache with dirty data which cannot be written out is + * evil, so prevent that. + */ + if (unlikely(ioend->io_error == -ENODEV)) { + xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, + __FILE__, __LINE__); } - vn_iowake(XFS_I(ioend->io_inode)); + + xfs_ioend_wake(ip); mempool_free(ioend, xfs_ioend_pool); } @@ -191,7 +234,7 @@ xfs_setfilesize( ip->i_d.di_size = isize; ip->i_update_core = 1; ip->i_update_size = 1; - mark_inode_dirty_sync(ioend->io_inode); + xfs_mark_inode_dirty_sync(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -317,14 +360,9 @@ xfs_map_blocks( xfs_iomap_t *mapp, int flags) { - xfs_inode_t *ip = XFS_I(inode); - int error, nmaps = 1; - - error = xfs_iomap(ip, offset, count, - flags, mapp, &nmaps); - if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) - xfs_iflags_set(ip, XFS_IMODIFIED); - return -error; + int nmaps = 1; + + return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); } STATIC_INLINE int @@ -512,7 +550,7 @@ xfs_cancel_ioend( unlock_buffer(bh); } while ((bh = next_bh) != NULL); - vn_iowake(XFS_I(ioend->io_inode)); + xfs_ioend_wake(XFS_I(ioend->io_inode)); mempool_free(ioend, xfs_ioend_pool); } while ((ioend = next) != NULL); } diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 3ba0631a3818..7b26f5ff9692 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -43,4 +43,7 @@ typedef struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); +extern void xfs_ioend_init(void); +extern void xfs_ioend_wait(struct xfs_inode *); + #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 36d5fcd3f593..cb329edc925b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -630,6 +630,29 @@ xfs_buf_get_flags( return NULL; } +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + int status; + + XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags); + + ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); + ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + + status = xfs_buf_iorequest(bp); + if (!status && !(flags & XBF_ASYNC)) + status = xfs_buf_iowait(bp); + return status; +} + xfs_buf_t * xfs_buf_read_flags( xfs_buftarg_t *target, @@ -646,7 +669,7 @@ xfs_buf_read_flags( if (!XFS_BUF_ISDONE(bp)) { XB_TRACE(bp, "read", (unsigned long)flags); XFS_STATS_INC(xb_get_read); - xfs_buf_iostart(bp, flags); + _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { XB_TRACE(bp, "read_async", (unsigned long)flags); /* @@ -1048,50 +1071,39 @@ xfs_buf_ioerror( XB_TRACE(bp, "ioerror", (unsigned long)error); } -/* - * Initiate I/O on a buffer, based on the flags supplied. - * The b_iodone routine in the buffer supplied will only be called - * when all of the subsidiary I/O requests, if any, have been completed. - */ int -xfs_buf_iostart( - xfs_buf_t *bp, - xfs_buf_flags_t flags) +xfs_bawrite( + void *mp, + struct xfs_buf *bp) { - int status = 0; + XB_TRACE(bp, "bawrite", 0); - XB_TRACE(bp, "iostart", (unsigned long)flags); + ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); - if (flags & XBF_DELWRI) { - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); - bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp, 1); - return 0; - } + xfs_buf_delwri_dequeue(bp); - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); - bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); + bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); + + bp->b_mount = mp; + bp->b_strat = xfs_bdstrat_cb; + return xfs_bdstrat_cb(bp); +} - BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); +void +xfs_bdwrite( + void *mp, + struct xfs_buf *bp) +{ + XB_TRACE(bp, "bdwrite", 0); - /* For writes allow an alternate strategy routine to precede - * the actual I/O request (which may not be issued at all in - * a shutdown situation, for example). - */ - status = (flags & XBF_WRITE) ? - xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); + bp->b_strat = xfs_bdstrat_cb; + bp->b_mount = mp; - /* Wait for I/O if we are not an async request. - * Note: async I/O request completion will release the buffer, - * and that can already be done by this point. So using the - * buffer pointer from here on, after async I/O, is invalid. - */ - if (!status && !(flags & XBF_ASYNC)) - status = xfs_buf_iowait(bp); + bp->b_flags &= ~XBF_READ; + bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); - return status; + xfs_buf_delwri_queue(bp, 1); } STATIC_INLINE void @@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io( unsigned int blocksize = bp->b_target->bt_bsize; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - bp->b_error = EIO; + xfs_buf_ioerror(bp, -error); do { struct page *page = bvec->bv_page; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 456519a088c7..288ae7c4c800 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -168,7 +168,7 @@ typedef struct xfs_buf { struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; - void *b_fspriv3; + struct xfs_mount *b_mount; unsigned short b_error; /* error code on I/O */ unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ @@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *); extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ +extern int xfs_bawrite(void *mp, xfs_buf_t *bp); +extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); -extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t); extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, @@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) #define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) -#define XFS_BUF_SHUT(bp) do { } while (0) -#define XFS_BUF_UNSHUT(bp) do { } while (0) -#define XFS_BUF_ISSHUT(bp) (0) - #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) @@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) -#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3) -#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val)) #define XFS_BUF_SET_START(bp) do { } while (0) #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) @@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_TARGET(bp) ((bp)->b_target) #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) -static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) -{ - bp->b_fspriv3 = mp; - bp->b_strat = xfs_bdstrat_cb; - xfs_buf_delwri_dequeue(bp); - return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); -} - static inline void xfs_buf_relse(xfs_buf_t *bp) { if (!bp->b_relse) @@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp) return error; } -/* - * No error can be returned from xfs_buf_iostart for delwri - * buffers as they are queued and no I/O is issued. - */ -static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) -{ - bp->b_strat = xfs_bdstrat_cb; - bp->b_fspriv3 = mp; - (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); -} - #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) #define xfs_iowait(bp) xfs_buf_iowait(bp) diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index 8c022cd0ad67..55bddf3b6091 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -25,12 +25,4 @@ */ typedef const struct cred cred_t; -extern cred_t *sys_cred; - -/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ -static inline int capable_cred(cred_t *cr, int cid) -{ - return (cr == sys_cred) ? 1 : capable(cid); -} - #endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 7f7abec25e14..595751f78350 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -29,7 +29,6 @@ #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "xfs_vfsops.h" /* * Note that we only accept fileids which are long enough rather than allow diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 3fee790f138b..e14c4e3aea0c 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -36,89 +36,54 @@ #include "xfs_inode.h" #include "xfs_error.h" #include "xfs_rw.h" -#include "xfs_ioctl32.h" #include "xfs_vnodeops.h" +#include "xfs_da_btree.h" +#include "xfs_ioctl.h" #include <linux/dcache.h> #include <linux/smp_lock.h> static struct vm_operations_struct xfs_file_vm_ops; -STATIC_INLINE ssize_t -__xfs_file_read( +STATIC ssize_t +xfs_file_aio_read( struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, - int ioflags, loff_t pos) { struct file *file = iocb->ki_filp; + int ioflags = IO_ISAIO; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, nr_segs, &iocb->ki_pos, ioflags); } STATIC ssize_t -xfs_file_aio_read( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos); -} - -STATIC ssize_t -xfs_file_aio_read_invis( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); -} - -STATIC_INLINE ssize_t -__xfs_file_write( +xfs_file_aio_write( struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, - int ioflags, loff_t pos) { - struct file *file = iocb->ki_filp; + struct file *file = iocb->ki_filp; + int ioflags = IO_ISAIO; BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, &iocb->ki_pos, ioflags); } STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos); -} - -STATIC ssize_t -xfs_file_aio_write_invis( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos); -} - -STATIC ssize_t xfs_file_splice_read( struct file *infilp, loff_t *ppos, @@ -126,20 +91,13 @@ xfs_file_splice_read( size_t len, unsigned int flags) { - return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), - infilp, ppos, pipe, len, flags, 0); -} + int ioflags = 0; + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; -STATIC ssize_t -xfs_file_splice_read_invis( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), - infilp, ppos, pipe, len, flags, IO_INVIS); + infilp, ppos, pipe, len, flags, ioflags); } STATIC ssize_t @@ -150,30 +108,49 @@ xfs_file_splice_write( size_t len, unsigned int flags) { - return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), - pipe, outfilp, ppos, len, flags, 0); -} + int ioflags = 0; + + if (outfilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; -STATIC ssize_t -xfs_file_splice_write_invis( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t len, - unsigned int flags) -{ return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), - pipe, outfilp, ppos, len, flags, IO_INVIS); + pipe, outfilp, ppos, len, flags, ioflags); } STATIC int xfs_file_open( struct inode *inode, - struct file *filp) + struct file *file) { - if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - return -xfs_open(XFS_I(inode)); + if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + return -EIO; + return 0; +} + +STATIC int +xfs_dir_open( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + int mode; + int error; + + error = xfs_file_open(inode, file); + if (error) + return error; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. + */ + mode = xfs_ilock_map_shared(ip); + if (ip->i_d.di_nextents > 0) + xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); + xfs_iunlock(ip, mode); + return 0; } STATIC int @@ -227,7 +204,7 @@ xfs_file_readdir( * point we can change the ->readdir prototype to include the * buffer size. */ - bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); + bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); error = xfs_readdir(ip, dirent, bufsize, (xfs_off_t *)&filp->f_pos, filldir); @@ -248,48 +225,6 @@ xfs_file_mmap( return 0; } -STATIC long -xfs_file_ioctl( - struct file *filp, - unsigned int cmd, - unsigned long p) -{ - int error; - struct inode *inode = filp->f_path.dentry->d_inode; - - error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p); - xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); - - /* NOTE: some of the ioctl's return positive #'s as a - * byte count indicating success, such as - * readlink_by_handle. So we don't "sign flip" - * like most other routines. This means true - * errors need to be returned as a negative value. - */ - return error; -} - -STATIC long -xfs_file_ioctl_invis( - struct file *filp, - unsigned int cmd, - unsigned long p) -{ - int error; - struct inode *inode = filp->f_path.dentry->d_inode; - - error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p); - xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); - - /* NOTE: some of the ioctl's return positive #'s as a - * byte count indicating success, such as - * readlink_by_handle. So we don't "sign flip" - * like most other routines. This means true - * errors need to be returned as a negative value. - */ - return error; -} - /* * mmap()d file has taken write protection fault and is being made * writable. We can set the page state up correctly for a writable @@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = { #endif }; -const struct file_operations xfs_invis_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = xfs_file_aio_read_invis, - .aio_write = xfs_file_aio_write_invis, - .splice_read = xfs_file_splice_read_invis, - .splice_write = xfs_file_splice_write_invis, - .unlocked_ioctl = xfs_file_ioctl_invis, -#ifdef CONFIG_COMPAT - .compat_ioctl = xfs_file_compat_invis_ioctl, -#endif - .mmap = xfs_file_mmap, - .open = xfs_file_open, - .release = xfs_file_release, - .fsync = xfs_file_fsync, -}; - - const struct file_operations xfs_dir_file_operations = { + .open = xfs_dir_open, .read = generic_read_dir, .readdir = xfs_file_readdir, .llseek = generic_file_llseek, diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 36caa6d957df..5aeb77776961 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; } int fs_nosys(void) { return ENOSYS; } void fs_noval(void) { return; } +/* + * note: all filemap functions return negative error codes. These + * need to be inverted before returning to the xfs core functions. + */ void xfs_tosspages( xfs_inode_t *ip, @@ -53,7 +57,7 @@ xfs_flushinval_pages( if (!ret) truncate_inode_pages(mapping, first); } - return ret; + return -ret; } int @@ -72,10 +76,23 @@ xfs_flush_pages( xfs_iflags_clear(ip, XFS_ITRUNCATED); ret = filemap_fdatawrite(mapping); if (flags & XFS_B_ASYNC) - return ret; + return -ret; ret2 = filemap_fdatawait(mapping); if (!ret) ret = ret2; } - return ret; + return -ret; +} + +int +xfs_wait_on_pages( + xfs_inode_t *ip, + xfs_off_t first, + xfs_off_t last) +{ + struct address_space *mapping = VFS_I(ip)->i_mapping; + + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + return -filemap_fdatawait(mapping); + return 0; } diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index ef90e64641e6..2ae8b1ccb02e 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -26,7 +26,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .restrict_chown = { 0, 1, 1 }, .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, 255 }, @@ -43,10 +42,3 @@ xfs_param_t xfs_params = { .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, }; - -/* - * Global system credential structure. - */ -static cred_t sys_cred_val; -cred_t *sys_cred = &sys_cred_val; - diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h index 6eda8a3eb6f1..69f71caf061c 100644 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -19,6 +19,5 @@ #define __XFS_GLOBALS_H__ extern uint64_t xfs_panic_mask; /* set to cause more panics */ -extern cred_t *sys_cred; #endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 281cbd5a25cf..e5be1e0be802 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -68,26 +68,22 @@ * XFS_IOC_PATH_TO_HANDLE * returns full handle for a path */ -STATIC int +int xfs_find_handle( unsigned int cmd, - void __user *arg) + xfs_fsop_handlereq_t *hreq) { int hsize; xfs_handle_t handle; - xfs_fsop_handlereq_t hreq; struct inode *inode; - if (copy_from_user(&hreq, arg, sizeof(hreq))) - return -XFS_ERROR(EFAULT); - memset((char *)&handle, 0, sizeof(handle)); switch (cmd) { case XFS_IOC_PATH_TO_FSHANDLE: case XFS_IOC_PATH_TO_HANDLE: { struct path path; - int error = user_lpath((const char __user *)hreq.path, &path); + int error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; @@ -101,7 +97,7 @@ xfs_find_handle( case XFS_IOC_FD_TO_HANDLE: { struct file *file; - file = fget(hreq.fd); + file = fget(hreq->fd); if (!file) return -EBADF; @@ -158,8 +154,8 @@ xfs_find_handle( } /* now copy our handle into the user buffer & write out the size */ - if (copy_to_user(hreq.ohandle, &handle, hsize) || - copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { iput(inode); return -XFS_ERROR(EFAULT); } @@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq( return 0; } -STATIC int +int xfs_open_by_handle( xfs_mount_t *mp, - void __user *arg, + xfs_fsop_handlereq_t *hreq, struct file *parfilp, struct inode *parinode) { @@ -263,14 +259,11 @@ xfs_open_by_handle( struct file *filp; struct inode *inode; struct dentry *dentry; - xfs_fsop_handlereq_t hreq; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); if (error) return -error; @@ -281,10 +274,10 @@ xfs_open_by_handle( } #if BITS_PER_LONG != 32 - hreq.oflags |= O_LARGEFILE; + hreq->oflags |= O_LARGEFILE; #endif /* Put open permission in namei format. */ - permflag = hreq.oflags; + permflag = hreq->oflags; if ((permflag+1) & O_ACCMODE) permflag++; if (permflag & O_TRUNC) @@ -322,15 +315,16 @@ xfs_open_by_handle( mntget(parfilp->f_path.mnt); /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred); + filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred); if (IS_ERR(filp)) { put_unused_fd(new_fd); return -XFS_ERROR(-PTR_ERR(filp)); } + if (inode->i_mode & S_IFREG) { /* invisible operation should not change atime */ filp->f_flags |= O_NOATIME; - filp->f_op = &xfs_invis_file_operations; + filp->f_mode |= FMODE_NOCMTIME; } fd_install(new_fd, filp); @@ -363,24 +357,21 @@ do_readlink( } -STATIC int +int xfs_readlink_by_handle( xfs_mount_t *mp, - void __user *arg, + xfs_fsop_handlereq_t *hreq, struct inode *parinode) { struct inode *inode; - xfs_fsop_handlereq_t hreq; __u32 olen; void *link; int error; if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); + error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); if (error) return -error; @@ -390,7 +381,7 @@ xfs_readlink_by_handle( goto out_iput; } - if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { error = -XFS_ERROR(EFAULT); goto out_iput; } @@ -402,7 +393,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(inode), link); if (error) goto out_kfree; - error = do_readlink(hreq.ohandle, olen, link); + error = do_readlink(hreq->ohandle, olen, link); if (error) goto out_kfree; @@ -501,7 +492,7 @@ xfs_attrlist_by_handle( return -error; } -STATIC int +int xfs_attrmulti_attr_get( struct inode *inode, char *name, @@ -530,7 +521,7 @@ xfs_attrmulti_attr_get( return error; } -STATIC int +int xfs_attrmulti_attr_set( struct inode *inode, char *name, @@ -560,7 +551,7 @@ xfs_attrmulti_attr_set( return error; } -STATIC int +int xfs_attrmulti_attr_remove( struct inode *inode, char *name, @@ -662,19 +653,26 @@ xfs_attrmulti_by_handle( return -error; } -STATIC int +int xfs_ioc_space( struct xfs_inode *ip, struct inode *inode, struct file *filp, int ioflags, unsigned int cmd, - void __user *arg) + xfs_flock64_t *bf) { - xfs_flock64_t bf; int attr_flags = 0; int error; + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -XFS_ERROR(EPERM); @@ -684,16 +682,12 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -XFS_ERROR(EINVAL); - if (copy_from_user(&bf, arg, sizeof(bf))) - return -XFS_ERROR(EFAULT); - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) attr_flags |= XFS_ATTR_NONBLOCK; if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; - error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, - NULL, attr_flags); + error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); return -error; } @@ -1105,10 +1099,6 @@ xfs_ioctl_setattr( /* * Change file ownership. Must be the owner or privileged. - * If the system was configured with the "restricted_chown" - * option, the owner is not permitted to give away the file, - * and can change the group id only to a group of which he - * or she is a member. */ if (mask & FSX_PROJID) { /* @@ -1137,7 +1127,7 @@ xfs_ioctl_setattr( * the superblock version number since projids didn't * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) + if (ip->i_d.di_version == 1) xfs_bump_ino_vers2(tp, ip); } @@ -1256,43 +1246,67 @@ xfs_ioc_setxflags( } STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = *ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int xfs_ioc_getbmap( struct xfs_inode *ip, int ioflags, unsigned int cmd, void __user *arg) { - struct getbmap bm; - int iflags; + struct getbmapx bmx; int error; - if (copy_from_user(&bm, arg, sizeof(bm))) + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) return -XFS_ERROR(EFAULT); - if (bm.bmv_count < 2) + if (bmx.bmv_count < 2) return -XFS_ERROR(EINVAL); - iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); if (ioflags & IO_INVIS) - iflags |= BMV_IF_NO_DMAPI_READ; + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (struct getbmap *)arg+1); if (error) return -error; - if (copy_to_user(arg, &bm, sizeof(bm))) + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) return -XFS_ERROR(EFAULT); return 0; } STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = *ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int xfs_ioc_getbmapx( struct xfs_inode *ip, void __user *arg) { struct getbmapx bmx; - struct getbmap bm; - int iflags; int error; if (copy_from_user(&bmx, arg, sizeof(bmx))) @@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx( if (bmx.bmv_count < 2) return -XFS_ERROR(EINVAL); - /* - * Map input getbmapx structure to a getbmap - * structure for xfs_getbmap. - */ - GETBMAP_CONVERT(bmx, bm); - - iflags = bmx.bmv_iflags; - - if (iflags & (~BMV_IF_VALID)) + if (bmx.bmv_iflags & (~BMV_IF_VALID)) return -XFS_ERROR(EINVAL); - iflags |= BMV_IF_EXTENDED; - - error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags); + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (struct getbmapx *)arg+1); if (error) return -error; - GETBMAP_CONVERT(bm, bmx); - - if (copy_to_user(arg, &bmx, sizeof(bmx))) + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) return -XFS_ERROR(EFAULT); return 0; } -int -xfs_ioctl( - xfs_inode_t *ip, +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. + * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( struct file *filp, - int ioflags, unsigned int cmd, - void __user *arg) + unsigned long p) { struct inode *inode = filp->f_path.dentry->d_inode; - xfs_mount_t *mp = ip->i_mount; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; int error; - xfs_itrace_entry(XFS_I(inode)); - switch (cmd) { + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + xfs_itrace_entry(ip); + + switch (cmd) { case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: @@ -1348,17 +1362,13 @@ xfs_ioctl( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP64: case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!xfs_sb_version_hasextflgbit(&mp->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg); + case XFS_IOC_UNRESVSP64: { + xfs_flock64_t bf; + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } case XFS_IOC_DIOINFO: { struct dioattr da; xfs_buftarg_t *target = @@ -1418,18 +1428,30 @@ xfs_ioctl( case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - return xfs_find_handle(cmd, arg); + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; - case XFS_IOC_OPEN_BY_HANDLE: - return xfs_open_by_handle(mp, arg, filp, inode); + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(mp, &hreq, filp, inode); + } case XFS_IOC_FSSETDM_BY_HANDLE: return xfs_fssetdm_by_handle(mp, arg, inode); - case XFS_IOC_READLINK_BY_HANDLE: - return xfs_readlink_by_handle(mp, arg, inode); + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(mp, &hreq, inode); + } case XFS_IOC_ATTRLIST_BY_HANDLE: return xfs_attrlist_by_handle(mp, arg, inode); @@ -1437,7 +1459,11 @@ xfs_ioctl( return xfs_attrmulti_by_handle(mp, arg, filp, inode); case XFS_IOC_SWAPEXT: { - error = xfs_swapext((struct xfs_swapext __user *)arg); + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); return -error; } @@ -1493,9 +1519,6 @@ xfs_ioctl( case XFS_IOC_FSGROWFSDATA: { xfs_growfs_data_t in; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); @@ -1506,9 +1529,6 @@ xfs_ioctl( case XFS_IOC_FSGROWFSLOG: { xfs_growfs_log_t in; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); @@ -1519,9 +1539,6 @@ xfs_ioctl( case XFS_IOC_FSGROWFSRT: { xfs_growfs_rt_t in; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); @@ -1529,21 +1546,6 @@ xfs_ioctl( return -error; } - case XFS_IOC_FREEZE: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (inode->i_sb->s_frozen == SB_UNFROZEN) - freeze_bdev(inode->i_sb->s_bdev); - return 0; - - case XFS_IOC_THAW: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_sb->s_frozen != SB_UNFROZEN) - thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); - return 0; - case XFS_IOC_GOINGDOWN: { __uint32_t in; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h new file mode 100644 index 000000000000..8c16bf2d7e03 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + xfs_mount_t *mp, + xfs_fsop_handlereq_t *hreq, + struct file *parfilp, + struct inode *parinode); + +extern int +xfs_readlink_by_handle( + xfs_mount_t *mp, + xfs_fsop_handlereq_t *hreq, + struct inode *parinode); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + char *name, + char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int + xfs_attrmulti_attr_set( + struct inode *inode, + char *name, + const char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + char *name, + __uint32_t flags); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +#endif diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index a4b254eb43b2..50903ad3182e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -16,11 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/compat.h> -#include <linux/init.h> #include <linux/ioctl.h> -#include <linux/syscalls.h> -#include <linux/types.h> -#include <linux/fs.h> #include <asm/uaccess.h> #include "xfs.h" #include "xfs_fs.h" @@ -36,7 +32,6 @@ #include "xfs_bmap_btree.h" #include "xfs_attr_sf.h" #include "xfs_dir2_sf.h" -#include "xfs_vfs.h" #include "xfs_vnode.h" #include "xfs_dinode.h" #include "xfs_inode.h" @@ -44,221 +39,219 @@ #include "xfs_error.h" #include "xfs_dfrag.h" #include "xfs_vnodeops.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" #include "xfs_ioctl32.h" #define _NATIVE_IOC(cmd, type) \ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) -#define BROKEN_X86_ALIGNMENT -#define _PACKED __attribute__((packed)) -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct xfs_flock64_32 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} xfs_flock64_32_t; - -#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32) -#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32) -#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32) -#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32) -#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32) -#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32) -#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32) -#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32) - -/* just account for different alignment */ -STATIC unsigned long -xfs_ioctl32_flock( - unsigned long arg) +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) { - xfs_flock64_32_t __user *p32 = (void __user *)arg; - xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); - - if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || - copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || - copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || - copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || - copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || - copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || - copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) - return -EFAULT; - - return (unsigned long)p; + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -XFS_ERROR(EFAULT); + return 0; } -typedef struct compat_xfs_fsop_geom_v1 { - __u32 blocksize; /* filesystem (data) block size */ - __u32 rtextsize; /* realtime extent size */ - __u32 agblocks; /* fsblocks in an AG */ - __u32 agcount; /* number of allocation groups */ - __u32 logblocks; /* fsblocks in the log */ - __u32 sectsize; /* (data) sector size, bytes */ - __u32 inodesize; /* inode size in bytes */ - __u32 imaxpct; /* max allowed inode space(%) */ - __u64 datablocks; /* fsblocks in data subvolume */ - __u64 rtblocks; /* fsblocks in realtime subvol */ - __u64 rtextents; /* rt extents in realtime subvol*/ - __u64 logstart; /* starting fsblock of the log */ - unsigned char uuid[16]; /* unique id of the filesystem */ - __u32 sunit; /* stripe unit, fsblocks */ - __u32 swidth; /* stripe width, fsblocks */ - __s32 version; /* structure version */ - __u32 flags; /* superblock version flags */ - __u32 logsectsize; /* log sector size, bytes */ - __u32 rtsectsize; /* realtime sector size, bytes */ - __u32 dirblocksize; /* directory block size, bytes */ -} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; - -#define XFS_IOC_FSGEOMETRY_V1_32 \ - _IOR ('X', 100, struct compat_xfs_fsop_geom_v1) - -STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg) +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) { - compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; - xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); + xfs_fsop_geom_t fsgeo; + int error; - if (copy_in_user(p, p32, sizeof(*p32))) - return -EFAULT; - return (unsigned long)p; + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + /* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -XFS_ERROR(EFAULT); + return 0; } -typedef struct compat_xfs_inogrp { - __u64 xi_startino; /* starting inode number */ - __s32 xi_alloccount; /* # bits set in allocmask */ - __u64 xi_allocmask; /* mask of allocated inodes */ -} __attribute__((packed)) compat_xfs_inogrp_t; - -STATIC int xfs_inumbers_fmt_compat( - void __user *ubuffer, - const xfs_inogrp_t *buffer, - long count, - long *written) +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) { - compat_xfs_inogrp_t __user *p32 = ubuffer; - long i; + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; for (i = 0; i < count; i++) { if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) - return -EFAULT; + return -XFS_ERROR(EFAULT); } *written = count * sizeof(*p32); return 0; } #else - #define xfs_inumbers_fmt_compat xfs_inumbers_fmt -#define _PACKED +#endif /* BROKEN_X86_ALIGNMENT */ -#endif +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ -/* XFS_IOC_FSBULKSTAT and friends */ + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -XFS_ERROR(EFAULT); + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid, &bstat32->bs_projid) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -XFS_ERROR(EFAULT); + return 0; +} -typedef struct compat_xfs_bstime { - __s32 tv_sec; /* seconds */ - __s32 tv_nsec; /* and nanoseconds */ -} compat_xfs_bstime_t; +/* XFS_IOC_FSBULKSTAT and friends */ -STATIC int xfs_bstime_store_compat( - compat_xfs_bstime_t __user *p32, - const xfs_bstime_t *p) +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) { - __s32 sec32; + __s32 sec32; sec32 = p->tv_sec; if (put_user(sec32, &p32->tv_sec) || put_user(p->tv_nsec, &p32->tv_nsec)) - return -EFAULT; + return -XFS_ERROR(EFAULT); return 0; } -typedef struct compat_xfs_bstat { - __u64 bs_ino; /* inode number */ - __u16 bs_mode; /* type and mode */ - __u16 bs_nlink; /* number of links */ - __u32 bs_uid; /* user id */ - __u32 bs_gid; /* group id */ - __u32 bs_rdev; /* device value */ - __s32 bs_blksize; /* block size */ - __s64 bs_size; /* file size */ - compat_xfs_bstime_t bs_atime; /* access time */ - compat_xfs_bstime_t bs_mtime; /* modify time */ - compat_xfs_bstime_t bs_ctime; /* inode change time */ - int64_t bs_blocks; /* number of blocks */ - __u32 bs_xflags; /* extended flags */ - __s32 bs_extsize; /* extent size */ - __s32 bs_extents; /* number of extents */ - __u32 bs_gen; /* generation count */ - __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ - __u32 bs_dmevmask; /* DMIG event mask */ - __u16 bs_dmstate; /* DMIG state info */ - __u16 bs_aextents; /* attribute number of extents */ -} _PACKED compat_xfs_bstat_t; - -STATIC int xfs_bulkstat_one_fmt_compat( +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( void __user *ubuffer, + int ubsize, + int *ubused, const xfs_bstat_t *buffer) { - compat_xfs_bstat_t __user *p32 = ubuffer; - - if (put_user(buffer->bs_ino, &p32->bs_ino) || - put_user(buffer->bs_mode, &p32->bs_mode) || - put_user(buffer->bs_nlink, &p32->bs_nlink) || - put_user(buffer->bs_uid, &p32->bs_uid) || - put_user(buffer->bs_gid, &p32->bs_gid) || - put_user(buffer->bs_rdev, &p32->bs_rdev) || - put_user(buffer->bs_blksize, &p32->bs_blksize) || - put_user(buffer->bs_size, &p32->bs_size) || + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return XFS_ERROR(ENOMEM); + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || - put_user(buffer->bs_blocks, &p32->bs_blocks) || - put_user(buffer->bs_xflags, &p32->bs_xflags) || - put_user(buffer->bs_extsize, &p32->bs_extsize) || - put_user(buffer->bs_extents, &p32->bs_extents) || - put_user(buffer->bs_gen, &p32->bs_gen) || - put_user(buffer->bs_projid, &p32->bs_projid) || - put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || - put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) - return -EFAULT; - return sizeof(*p32); + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*p32); + return 0; } - - -typedef struct compat_xfs_fsop_bulkreq { - compat_uptr_t lastip; /* last inode # pointer */ - __s32 icount; /* count of entries in buffer */ - compat_uptr_t ubuffer; /* user buffer for inode desc. */ - compat_uptr_t ocount; /* output count pointer */ -} compat_xfs_fsop_bulkreq_t; - -#define XFS_IOC_FSBULKSTAT_32 \ - _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) -#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ - _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) -#define XFS_IOC_FSINUMBERS_32 \ - _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + void *private_data, /* my private data */ + xfs_daddr_t bno, /* starting bno of inode cluster */ + int *ubused, /* bytes used by me */ + void *dibuff, /* on-disk inode buffer */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, bno, + ubused, dibuff, stat); +} /* copied from xfs_ioctl.c */ STATIC int -xfs_ioc_bulkstat_compat( - xfs_mount_t *mp, - unsigned int cmd, - void __user *arg) +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) { - compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg; u32 addr; xfs_fsop_bulkreq_t bulkreq; int count; /* # of records returned */ @@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat( /* should be called again (unused here, but used in dmapi) */ if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + return -XFS_ERROR(EPERM); if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); if (get_user(addr, &p32->lastip)) - return -EFAULT; + return -XFS_ERROR(EFAULT); bulkreq.lastip = compat_ptr(addr); if (get_user(bulkreq.icount, &p32->icount) || get_user(addr, &p32->ubuffer)) - return -EFAULT; + return -XFS_ERROR(EFAULT); bulkreq.ubuffer = compat_ptr(addr); if (get_user(addr, &p32->ocount)) - return -EFAULT; + return -XFS_ERROR(EFAULT); bulkreq.ocount = compat_ptr(addr); if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) @@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat( if (bulkreq.ubuffer == NULL) return -XFS_ERROR(EINVAL); - if (cmd == XFS_IOC_FSINUMBERS) + if (cmd == XFS_IOC_FSINUMBERS_32) { error = xfs_inumbers(mp, &inlast, &count, bulkreq.ubuffer, xfs_inumbers_fmt_compat); - else { - /* declare a var to get a warning in case the type changes */ - bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), + NULL, 0, NULL, NULL, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, - xfs_bulkstat_one, formatter, + xfs_bulkstat_one_compat, NULL, sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, BULKSTAT_FG_QUICK, &done); - } + } else + error = XFS_ERROR(EINVAL); if (error) return -error; @@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat( return 0; } +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + return 0; +} -typedef struct compat_xfs_fsop_handlereq { - __u32 fd; /* fd for FD_TO_HANDLE */ - compat_uptr_t path; /* user pathname */ - __u32 oflags; /* open flags */ - compat_uptr_t ihandle; /* user supplied handle */ - __u32 ihandlen; /* user supplied length */ - compat_uptr_t ohandle; /* user buffer for handle */ - compat_uptr_t ohandlen; /* user buffer length */ -} compat_xfs_fsop_handlereq_t; - -#define XFS_IOC_PATH_TO_FSHANDLE_32 \ - _IOWR('X', 104, struct compat_xfs_fsop_handlereq) -#define XFS_IOC_PATH_TO_HANDLE_32 \ - _IOWR('X', 105, struct compat_xfs_fsop_handlereq) -#define XFS_IOC_FD_TO_HANDLE_32 \ - _IOWR('X', 106, struct compat_xfs_fsop_handlereq) -#define XFS_IOC_OPEN_BY_HANDLE_32 \ - _IOWR('X', 107, struct compat_xfs_fsop_handlereq) -#define XFS_IOC_READLINK_BY_HANDLE_32 \ - _IOWR('X', 108, struct compat_xfs_fsop_handlereq) - -STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg) +/* + * Convert userspace handle data into inode. + * + * We use the fact that all the fsop_handlereq ioctl calls have a data + * structure argument whose first component is always a xfs_fsop_handlereq_t, + * so we can pass that sub structure into this handy, shared routine. + * + * If no error, caller must always iput the returned inode. + */ +STATIC int +xfs_vget_fsop_handlereq_compat( + xfs_mount_t *mp, + struct inode *parinode, /* parent inode pointer */ + compat_xfs_fsop_handlereq_t *hreq, + struct inode **inode) { - compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; - xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); - u32 addr; - - if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || - get_user(addr, &p32->path) || - put_user(compat_ptr(addr), &p->path) || - copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || - get_user(addr, &p32->ihandle) || - put_user(compat_ptr(addr), &p->ihandle) || - copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || - get_user(addr, &p32->ohandle) || - put_user(compat_ptr(addr), &p->ohandle) || - get_user(addr, &p32->ohandlen) || - put_user(compat_ptr(addr), &p->ohandlen)) - return -EFAULT; - - return (unsigned long)p; + void __user *hanp; + size_t hlen; + xfs_fid_t *xfid; + xfs_handle_t *handlep; + xfs_handle_t handle; + xfs_inode_t *ip; + xfs_ino_t ino; + __u32 igen; + int error; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parinode->i_mode)) + return XFS_ERROR(ENOTDIR); + + hanp = compat_ptr(hreq->ihandle); + hlen = hreq->ihandlen; + handlep = &handle; + + if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) + return XFS_ERROR(EINVAL); + if (copy_from_user(handlep, hanp, hlen)) + return XFS_ERROR(EFAULT); + if (hlen < sizeof(*handlep)) + memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); + if (hlen > sizeof(handlep->ha_fsid)) { + if (handlep->ha_fid.fid_len != + (hlen - sizeof(handlep->ha_fsid) - + sizeof(handlep->ha_fid.fid_len)) || + handlep->ha_fid.fid_pad) + return XFS_ERROR(EINVAL); + } + + /* + * Crack the handle, obtain the inode # & generation # + */ + xfid = (struct xfs_fid *)&handlep->ha_fid; + if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { + ino = xfid->fid_ino; + igen = xfid->fid_gen; + } else { + return XFS_ERROR(EINVAL); + } + + /* + * Get the XFS inode, building a Linux inode to go with it. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); + if (error) + return error; + if (ip == NULL) + return XFS_ERROR(EIO); + if (ip->i_d.di_gen != igen) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOENT); + } + + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + *inode = VFS_I(ip); + return 0; } +STATIC int +xfs_compat_attrlist_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct inode *parinode) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct inode *inode; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq, + &inode); + if (error) + goto out; + + kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_vn_rele; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_vn_rele: + iput(inode); + out: + return -error; +} -STATIC long -xfs_compat_ioctl( - int mode, - struct file *file, - unsigned cmd, - unsigned long arg) +STATIC int +xfs_compat_attrmulti_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct inode *parinode) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct inode *inode; + unsigned int i, size; + char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq, + &inode); + if (error) + goto out; + + error = E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_vn_rele; + + error = ENOMEM; + ops = kmalloc(size, GFP_KERNEL); + if (!ops) + goto out_vn_rele; + + error = EFAULT; + if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) + goto out_kfree_ops; + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user(attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get(inode, + attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = xfs_attrmulti_attr_set(inode, + attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = xfs_attrmulti_attr_remove(inode, + attr_name, ops[i].am_flags); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_vn_rele: + iput(inode); + out: + return -error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct inode *parinode) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct inode *inode; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq, + &inode); + if (error) + return -error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + iput(inode); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) { - struct inode *inode = file->f_path.dentry->d_inode; - int error; + struct inode *inode = filp->f_path.dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + xfs_itrace_entry(ip); switch (cmd) { + /* No size or alignment issues on any arch */ case XFS_IOC_DIOINFO: case XFS_IOC_FSGEOMETRY: case XFS_IOC_FSGETXATTR: @@ -387,48 +628,16 @@ xfs_compat_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: case XFS_IOC_GETBMAPX: -/* not handled - case XFS_IOC_FSSETDM_BY_HANDLE: - case XFS_IOC_ATTRLIST_BY_HANDLE: - case XFS_IOC_ATTRMULTI_BY_HANDLE: -*/ case XFS_IOC_FSCOUNTS: case XFS_IOC_SET_RESBLKS: case XFS_IOC_GET_RESBLKS: - case XFS_IOC_FSGROWFSDATA: case XFS_IOC_FSGROWFSLOG: - case XFS_IOC_FSGROWFSRT: - case XFS_IOC_FREEZE: - case XFS_IOC_THAW: case XFS_IOC_GOINGDOWN: case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: - break; - - case XFS_IOC32_GETXFLAGS: - case XFS_IOC32_SETXFLAGS: - case XFS_IOC32_GETVERSION: - cmd = _NATIVE_IOC(cmd, long); - break; -#ifdef BROKEN_X86_ALIGNMENT - /* xfs_flock_t has wrong u32 vs u64 alignment */ - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: - case XFS_IOC_RESVSP_32: - case XFS_IOC_UNRESVSP_32: - case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: - arg = xfs_ioctl32_flock(arg); - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - break; - case XFS_IOC_FSGEOMETRY_V1_32: - arg = xfs_ioctl32_geom_v1(arg); - cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1); - break; - -#else /* These are handled fine if no alignment issues */ + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: @@ -438,51 +647,97 @@ xfs_compat_ioctl( case XFS_IOC_RESVSP64: case XFS_IOC_UNRESVSP64: case XFS_IOC_FSGEOMETRY_V1: - break; + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: { + struct xfs_flock64 bf; - /* xfs_bstat_t still has wrong u32 vs u64 alignment */ - case XFS_IOC_SWAPEXT: - break; + if (xfs_compat_flock64_copyin(&bf, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_data(mp, &in); + return -error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + if (xfs_compat_growfs_rt_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = xfs_growfs_rt(mp, &in); + return -error; + } #endif + /* long changes size, but xfs only copiese out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -XFS_ERROR(EFAULT); + error = xfs_swapext(&sxp); + return -error; + } case XFS_IOC_FSBULKSTAT_32: case XFS_IOC_FSBULKSTAT_SINGLE_32: case XFS_IOC_FSINUMBERS_32: - cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); - return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount, - cmd, (void __user*)arg); + return xfs_compat_ioc_bulkstat(mp, cmd, arg); case XFS_IOC_FD_TO_HANDLE_32: case XFS_IOC_PATH_TO_HANDLE_32: - case XFS_IOC_PATH_TO_FSHANDLE_32: - case XFS_IOC_OPEN_BY_HANDLE_32: - case XFS_IOC_READLINK_BY_HANDLE_32: - arg = xfs_ioctl32_fshandle(arg); + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); - break; - default: - return -ENOIOCTLCMD; + return xfs_find_handle(cmd, &hreq); } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; - error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); - xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); - - return error; -} - -long -xfs_file_compat_ioctl( - struct file *file, - unsigned cmd, - unsigned long arg) -{ - return xfs_compat_ioctl(0, file, cmd, arg); -} + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(mp, &hreq, filp, inode); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; -long -xfs_file_compat_invis_ioctl( - struct file *file, - unsigned cmd, - unsigned long arg) -{ - return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(mp, &hreq, inode); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(mp, arg, inode); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(mp, arg, inode); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(mp, arg, inode); + default: + return -XFS_ERROR(ENOIOCTLCMD); + } } diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 02de6e62ee37..1024c4f8ba0d 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -18,7 +18,217 @@ #ifndef __XFS_IOCTL32_H__ #define __XFS_IOCTL32_H__ -extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); -extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); +#include <linux/compat.h> + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. + */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid; /* project id */ + unsigned char bs_pad[14]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ #endif /* __XFS_IOCTL32_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 095d271f3434..7aa53fefc67f 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -53,6 +53,7 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/falloc.h> +#include <linux/fiemap.h> /* * Bring the atime in the XFS inode uptodate. @@ -64,14 +65,14 @@ xfs_synchronize_atime( { struct inode *inode = VFS_I(ip); - if (inode) { + if (!(inode->i_state & I_CLEAR)) { ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; } } /* - * If the linux inode exists, mark it dirty. + * If the linux inode is valid, mark it dirty. * Used when commiting a dirty inode into a transaction so that * the inode will get written back by the linux code */ @@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync( { struct inode *inode = VFS_I(ip); - if (inode) + if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) mark_inode_dirty_sync(inode); } @@ -128,7 +129,7 @@ xfs_ichgtime( if (sync_it) { SYNCHRONIZE(); ip->i_update_core = 1; - mark_inode_dirty_sync(inode); + xfs_mark_inode_dirty_sync(ip); } } @@ -158,8 +159,6 @@ xfs_init_security( } error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); - if (!error) - xfs_iflags_set(ip, XFS_IMODIFIED); kfree(name); kfree(value); @@ -260,7 +259,6 @@ xfs_vn_mknod( error = _ACL_INHERIT(inode, mode, default_acl); if (unlikely(error)) goto out_cleanup_inode; - xfs_iflags_set(ip, XFS_IMODIFIED); _ACL_FREE(default_acl); } @@ -366,21 +364,17 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *inode; /* inode of guy being linked to */ + struct inode *inode = old_dentry->d_inode; struct xfs_name name; int error; - inode = old_dentry->d_inode; xfs_dentry_to_name(&name, dentry); - igrab(inode); error = xfs_link(XFS_I(dir), XFS_I(inode), &name); - if (unlikely(error)) { - iput(inode); + if (unlikely(error)) return -error; - } - xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); + atomic_inc(&inode->i_count); d_instantiate(dentry, inode); return 0; } @@ -601,7 +595,7 @@ xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); + return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0); } /* @@ -642,7 +636,7 @@ xfs_vn_fallocate( xfs_ilock(ip, XFS_IOLOCK_EXCL); error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, - 0, NULL, XFS_ATTR_NOLOCK); + 0, XFS_ATTR_NOLOCK); if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) new_size = offset + len; @@ -653,7 +647,7 @@ xfs_vn_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); + error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); } xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -661,6 +655,88 @@ out_error: return error; } +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. + */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= FIEMAP_EXTENT_DELALLOC; + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return -error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBB(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(length); + + /* our formatter will tell xfs_getbmap when to stop. */ + bm.bmv_count = MAXEXTNUM; + bm.bmv_iflags = BMV_IF_PREALLOC; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return -error; + + return 0; +} + static const struct inode_operations xfs_inode_operations = { .permission = xfs_vn_permission, .truncate = xfs_vn_truncate, @@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fallocate = xfs_vn_fallocate, + .fiemap = xfs_vn_fiemap, }; static const struct inode_operations xfs_dir_inode_operations = { @@ -766,12 +843,20 @@ xfs_diflags_to_iflags( * When reading existing inodes from disk this is called directly * from xfs_iget, when creating a new inode it is called from * xfs_ialloc after setting up the inode. + * + * We are always called with an uninitialised linux inode here. + * We need to initialise the necessary fields and take a reference + * on it. */ void xfs_setup_inode( struct xfs_inode *ip) { - struct inode *inode = ip->i_vnode; + struct inode *inode = &ip->i_vnode; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW|I_LOCK; + inode_add_to_lists(ip->i_mount->m_super, inode); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; @@ -799,7 +884,6 @@ xfs_setup_inode( inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; xfs_diflags_to_iflags(inode, ip); - xfs_iflags_clear(ip, XFS_IMODIFIED); switch (inode->i_mode & S_IFMT) { case S_IFREG: diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index 8b1a1e31dc21..ef41c92ce66e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -22,7 +22,6 @@ struct xfs_inode; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; -extern const struct file_operations xfs_invis_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index cc0f7b3a9795..507492d6dccd 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -21,18 +21,12 @@ #include <linux/types.h> /* - * Some types are conditional depending on the target system. * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. - * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well - * as requiring XFS_BIG_BLKNOS to be set. + * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. */ #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) # define XFS_BIG_BLKNOS 1 -# if BITS_PER_LONG == 64 -# define XFS_BIG_INUMS 1 -# else -# define XFS_BIG_INUMS 0 -# endif +# define XFS_BIG_INUMS 1 #else # define XFS_BIG_BLKNOS 0 # define XFS_BIG_INUMS 0 @@ -77,6 +71,7 @@ #include <linux/spinlock.h> #include <linux/random.h> #include <linux/ctype.h> +#include <linux/writeback.h> #include <asm/page.h> #include <asm/div64.h> @@ -85,7 +80,6 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_vfs.h> #include <xfs_cred.h> #include <xfs_vnode.h> #include <xfs_stats.h> @@ -107,7 +101,6 @@ #undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #endif -#define restricted_chown xfs_params.restrict_chown.val #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 1957e5357d04..7e90daa0d1d1 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -51,7 +51,6 @@ #include "xfs_vnodeops.h" #include <linux/capability.h> -#include <linux/mount.h> #include <linux/writeback.h> @@ -243,7 +242,7 @@ xfs_read( if (unlikely(ioflags & IO_ISDIRECT)) { if (inode->i_mapping->nrpages) - ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), + ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); mutex_unlock(&inode->i_mutex); if (ret) { @@ -668,15 +667,8 @@ start: if (new_size > xip->i_size) xip->i_new_size = new_size; - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (likely(!(ioflags & IO_INVIS) && - !mnt_want_write(file->f_path.mnt))) { + if (likely(!(ioflags & IO_INVIS))) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - mnt_drop_write(file->f_path.mnt); - } /* * If the offset is beyond the size of the file, we have a couple @@ -715,7 +707,6 @@ start: } } -retry: /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -771,6 +762,17 @@ retry: if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) ret = wait_on_sync_kiocb(iocb); + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) + *offset = isize; + + if (*offset > xip->i_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_size) + xip->i_size = *offset; + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + if (ret == -ENOSPC && DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_iunlock(xip, iolock); @@ -784,20 +786,7 @@ retry: xfs_ilock(xip, iolock); if (error) goto out_unlock_internal; - pos = xip->i_size; - ret = 0; - goto retry; - } - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_size) - xip->i_size = *offset; - xfs_iunlock(xip, XFS_ILOCK_EXCL); + goto start; } error = -ret; @@ -855,13 +844,7 @@ retry: int xfs_bdstrat_cb(struct xfs_buf *bp) { - xfs_mount_t *mp; - - mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_iorequest(bp); - return 0; - } else { + if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { xfs_buftrace("XFS__BDSTRAT IOERROR", bp); /* * Metadata write that didn't get logged but @@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp) else return (xfs_bioerror(bp)); } + + xfs_buf_iorequest(bp); + return 0; } /* diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index 3d5b67c075c7..c3526d445f6a 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -53,11 +53,15 @@ xfs_read_xfsstats( { "icluster", XFSSTAT_END_INODE_CLUSTER }, { "vnodes", XFSSTAT_END_VNODE_OPS }, { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, }; /* Loop over all stats groups */ for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { - len += sprintf(buffer + len, xstats[i].desc); + len += sprintf(buffer + len, "%s", xstats[i].desc); /* inner loop does each group */ while (j < xstats[i].endpoint) { val = 0; diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index e83820febc9f..736854b1ca1a 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -118,6 +118,71 @@ struct xfsstats { __uint32_t xb_page_retries; __uint32_t xb_page_found; __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 37ebe36056eb..95a971080368 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -18,7 +18,6 @@ #include "xfs.h" #include "xfs_bit.h" #include "xfs_log.h" -#include "xfs_clnt.h" #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_sb.h" @@ -36,6 +35,7 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_rtalloc.h" @@ -48,7 +48,6 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_vnodeops.h" -#include "xfs_vfsops.h" #include "xfs_version.h" #include "xfs_log_priv.h" #include "xfs_trans_priv.h" @@ -58,6 +57,7 @@ #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" #include "xfs_inode_item.h" +#include "xfs_sync.h" #include <linux/namei.h> #include <linux/init.h> @@ -70,36 +70,9 @@ static struct quotactl_ops xfs_quotactl_operations; static struct super_operations xfs_super_operations; -static kmem_zone_t *xfs_vnode_zone; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; -STATIC struct xfs_mount_args * -xfs_args_allocate( - struct super_block *sb, - int silent) -{ - struct xfs_mount_args *args; - - args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL); - if (!args) - return NULL; - - args->logbufs = args->logbufsize = -1; - strncpy(args->fsname, sb->s_id, MAXNAMELEN); - - /* Copy the already-parsed mount(2) flags we're interested in */ - if (sb->s_flags & MS_DIRSYNC) - args->flags |= XFSMNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) - args->flags |= XFSMNT_WSYNC; - if (silent) - args->flags |= XFSMNT_QUIET; - args->flags |= XFSMNT_32BITINODES; - - return args; -} - #define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ #define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ #define MNTOPT_LOGDEV "logdev" /* log device */ @@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base) return simple_strtoul((const char *)s, endp, base) << shift_left_factor; } +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. + */ STATIC int xfs_parseargs( struct xfs_mount *mp, char *options, - struct xfs_mount_args *args, - int update) + char **mtpt) { + struct super_block *sb = mp->m_super; char *this_char, *value, *eov; - int dsunit, dswidth, vol_dsunit, vol_dswidth; - int iosize; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; int dmapi_implies_ikeep = 1; + uchar_t iosizelog = 0; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. + */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - args->flags |= XFSMNT_BARRIER; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; if (!options) goto done; - iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0; - while ((this_char = strsep(&options, ",")) != NULL) { if (!*this_char) continue; @@ -221,7 +222,7 @@ xfs_parseargs( this_char); return EINVAL; } - args->logbufs = simple_strtoul(value, &eov, 10); + mp->m_logbufs = simple_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -229,7 +230,7 @@ xfs_parseargs( this_char); return EINVAL; } - args->logbufsize = suffix_strtoul(value, &eov, 10); + mp->m_logbsize = suffix_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -237,7 +238,9 @@ xfs_parseargs( this_char); return EINVAL; } - strncpy(args->logname, value, MAXNAMELEN); + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return ENOMEM; } else if (!strcmp(this_char, MNTOPT_MTPT)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -245,7 +248,9 @@ xfs_parseargs( this_char); return EINVAL; } - strncpy(args->mtpt, value, MAXNAMELEN); + *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!*mtpt) + return ENOMEM; } else if (!strcmp(this_char, MNTOPT_RTDEV)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -253,7 +258,9 @@ xfs_parseargs( this_char); return EINVAL; } - strncpy(args->rtname, value, MAXNAMELEN); + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return ENOMEM; } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -262,8 +269,7 @@ xfs_parseargs( return EINVAL; } iosize = simple_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = (uint8_t) iosize; + iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -272,8 +278,7 @@ xfs_parseargs( return EINVAL; } iosize = suffix_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = ffs(iosize) - 1; + iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_GRPID) || !strcmp(this_char, MNTOPT_BSDGROUPS)) { mp->m_flags |= XFS_MOUNT_GRPID; @@ -281,23 +286,25 @@ xfs_parseargs( !strcmp(this_char, MNTOPT_SYSVGROUPS)) { mp->m_flags &= ~XFS_MOUNT_GRPID; } else if (!strcmp(this_char, MNTOPT_WSYNC)) { - args->flags |= XFSMNT_WSYNC; + mp->m_flags |= XFS_MOUNT_WSYNC; } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - args->flags |= XFSMNT_OSYNCISOSYNC; + mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { - args->flags |= XFSMNT_NORECOVERY; + mp->m_flags |= XFS_MOUNT_NORECOVERY; } else if (!strcmp(this_char, MNTOPT_INO64)) { - args->flags |= XFSMNT_INO64; -#if !XFS_BIG_INUMS +#if XFS_BIG_INUMS + mp->m_flags |= XFS_MOUNT_INO64; + mp->m_inoadd = XFS_INO64_OFFSET; +#else cmn_err(CE_WARN, "XFS: %s option not allowed on this system", this_char); return EINVAL; #endif } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { - args->flags |= XFSMNT_NOALIGN; + mp->m_flags |= XFS_MOUNT_NOALIGN; } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { - args->flags |= XFSMNT_SWALLOC; + mp->m_flags |= XFS_MOUNT_SWALLOC; } else if (!strcmp(this_char, MNTOPT_SUNIT)) { if (!value || !*value) { cmn_err(CE_WARN, @@ -315,7 +322,7 @@ xfs_parseargs( } dswidth = simple_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { - args->flags &= ~XFSMNT_32BITINODES; + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; #if !XFS_BIG_INUMS cmn_err(CE_WARN, "XFS: %s option not allowed on this system", @@ -323,56 +330,61 @@ xfs_parseargs( return EINVAL; #endif } else if (!strcmp(this_char, MNTOPT_NOUUID)) { - args->flags |= XFSMNT_NOUUID; + mp->m_flags |= XFS_MOUNT_NOUUID; } else if (!strcmp(this_char, MNTOPT_BARRIER)) { - args->flags |= XFSMNT_BARRIER; + mp->m_flags |= XFS_MOUNT_BARRIER; } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { - args->flags &= ~XFSMNT_BARRIER; + mp->m_flags &= ~XFS_MOUNT_BARRIER; } else if (!strcmp(this_char, MNTOPT_IKEEP)) { - args->flags |= XFSMNT_IKEEP; + mp->m_flags |= XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { dmapi_implies_ikeep = 0; - args->flags &= ~XFSMNT_IKEEP; + mp->m_flags &= ~XFS_MOUNT_IKEEP; } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { - args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; } else if (!strcmp(this_char, MNTOPT_ATTR2)) { - args->flags |= XFSMNT_ATTR2; + mp->m_flags |= XFS_MOUNT_ATTR2; } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { - args->flags &= ~XFSMNT_ATTR2; - args->flags |= XFSMNT_NOATTR2; + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { - args->flags2 |= XFSMNT2_FILESTREAMS; + mp->m_flags |= XFS_MOUNT_FILESTREAMS; } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { - args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); - args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); + mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_QUOTA) || !strcmp(this_char, MNTOPT_UQUOTA) || !strcmp(this_char, MNTOPT_USRQUOTA)) { - args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || !strcmp(this_char, MNTOPT_UQUOTANOENF)) { - args->flags |= XFSMNT_UQUOTA; - args->flags &= ~XFSMNT_UQUOTAENF; + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_PQUOTA) || !strcmp(this_char, MNTOPT_PRJQUOTA)) { - args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { - args->flags |= XFSMNT_PQUOTA; - args->flags &= ~XFSMNT_PQUOTAENF; + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_GQUOTA) || !strcmp(this_char, MNTOPT_GRPQUOTA)) { - args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_OQUOTA_ENFD); } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { - args->flags |= XFSMNT_GQUOTA; - args->flags &= ~XFSMNT_GQUOTAENF; + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DMAPI)) { - args->flags |= XFSMNT_DMAPI; + mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_XDSM)) { - args->flags |= XFSMNT_DMAPI; + mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { - args->flags |= XFSMNT_DMAPI; + mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -390,27 +402,29 @@ xfs_parseargs( } } - if (args->flags & XFSMNT_NORECOVERY) { - if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { - cmn_err(CE_WARN, - "XFS: no-recovery mounts must be read-only."); - return EINVAL; - } + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); + return EINVAL; } - if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { cmn_err(CE_WARN, "XFS: sunit and swidth options incompatible with the noalign option"); return EINVAL; } - if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { cmn_err(CE_WARN, "XFS: cannot mount with both project and group quota"); return EINVAL; } - if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { + if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) { printk("XFS: %s option needs the mount point option as well\n", MNTOPT_DMAPI); return EINVAL; @@ -438,27 +452,66 @@ xfs_parseargs( * Note that if "ikeep" or "noikeep" mount options are * supplied, then they are honored. */ - if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) - args->flags |= XFSMNT_IKEEP; + if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep) + mp->m_flags |= XFS_MOUNT_IKEEP; - if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { +done: + if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ if (dsunit) { - args->sunit = dsunit; - args->flags |= XFSMNT_RETERR; - } else { - args->sunit = vol_dsunit; + mp->m_dalign = dsunit; + mp->m_flags |= XFS_MOUNT_RETERR; } - dswidth ? (args->swidth = dswidth) : - (args->swidth = vol_dswidth); - } else { - args->sunit = args->swidth = 0; + + if (dswidth) + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + cmn_err(CE_WARN, + "XFS: invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + cmn_err(CE_WARN, + "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return XFS_ERROR(EINVAL); + } + + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + cmn_err(CE_WARN, + "XFS: invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; } -done: - if (args->flags & XFSMNT_32BITINODES) - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - if (args->flags2) - args->flags |= XFSMNT_FLAGS2; return 0; } @@ -704,8 +757,7 @@ xfs_close_devices( */ STATIC int xfs_open_devices( - struct xfs_mount *mp, - struct xfs_mount_args *args) + struct xfs_mount *mp) { struct block_device *ddev = mp->m_super->s_bdev; struct block_device *logdev = NULL, *rtdev = NULL; @@ -714,14 +766,14 @@ xfs_open_devices( /* * Open real time and log devices - order is important. */ - if (args->logname[0]) { - error = xfs_blkdev_get(mp, args->logname, &logdev); + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) goto out; } - if (args->rtname[0]) { - error = xfs_blkdev_get(mp, args->rtname, &rtdev); + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); if (error) goto out_close_logdev; @@ -813,18 +865,18 @@ xfs_setup_devices( */ void xfsaild_wakeup( - xfs_mount_t *mp, + struct xfs_ail *ailp, xfs_lsn_t threshold_lsn) { - mp->m_ail.xa_target = threshold_lsn; - wake_up_process(mp->m_ail.xa_task); + ailp->xa_target = threshold_lsn; + wake_up_process(ailp->xa_task); } int xfsaild( void *data) { - xfs_mount_t *mp = (xfs_mount_t *)data; + struct xfs_ail *ailp = data; xfs_lsn_t last_pushed_lsn = 0; long tout = 0; @@ -836,11 +888,11 @@ xfsaild( /* swsusp */ try_to_freeze(); - ASSERT(mp->m_log); - if (XFS_FORCED_SHUTDOWN(mp)) + ASSERT(ailp->xa_mount->m_log); + if (XFS_FORCED_SHUTDOWN(ailp->xa_mount)) continue; - tout = xfsaild_push(mp, &last_pushed_lsn); + tout = xfsaild_push(ailp, &last_pushed_lsn); } return 0; @@ -848,43 +900,82 @@ xfsaild( int xfsaild_start( - xfs_mount_t *mp) + struct xfs_ail *ailp) { - mp->m_ail.xa_target = 0; - mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); - if (IS_ERR(mp->m_ail.xa_task)) - return -PTR_ERR(mp->m_ail.xa_task); + ailp->xa_target = 0; + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); + if (IS_ERR(ailp->xa_task)) + return -PTR_ERR(ailp->xa_task); return 0; } void xfsaild_stop( - xfs_mount_t *mp) + struct xfs_ail *ailp) { - kthread_stop(mp->m_ail.xa_task); + kthread_stop(ailp->xa_task); } - +/* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( struct super_block *sb) { - return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); + BUG(); + return NULL; } +/* + * Now that the generic code is guaranteed not to be accessing + * the linux inode, we can reclaim the inode. + */ STATIC void xfs_fs_destroy_inode( - struct inode *inode) + struct inode *inode) { - kmem_zone_free(xfs_vnode_zone, inode); + xfs_inode_t *ip = XFS_I(inode); + + XFS_STATS_INC(vn_reclaim); + if (xfs_reclaim(ip)) + panic("%s: cannot reclaim 0x%p\n", __func__, inode); } +/* + * Slab object creation initialisation for the XFS inode. + * This covers only the idempotent fields in the XFS inode; + * all other fields need to be initialised on allocation + * from the slab. This avoids the need to repeatedly intialise + * fields in the xfs inode that left in the initialise state + * when freeing the inode. + */ STATIC void xfs_fs_inode_init_once( - void *vnode) + void *inode) { - inode_init_once((struct inode *)vnode); + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_iocount, 0); + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + init_waitqueue_head(&ip->i_ipin_wait); + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&ip->i_flush); + complete(&ip->i_flush); + + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); } /* @@ -898,21 +989,26 @@ xfs_fs_write_inode( struct inode *inode, int sync) { + struct xfs_inode *ip = XFS_I(inode); int error = 0; int flags = 0; - xfs_itrace_entry(XFS_I(inode)); + xfs_itrace_entry(ip); if (sync) { - filemap_fdatawait(inode->i_mapping); + error = xfs_wait_on_pages(ip, 0, -1); + if (error) + goto out_error; flags |= FLUSH_SYNC; } - error = xfs_inode_flush(XFS_I(inode), flags); + error = xfs_inode_flush(ip, flags); + +out_error: /* * if we failed to write out the inode then mark * it dirty again so we'll try again later. */ if (error) - mark_inode_dirty_sync(inode); + xfs_mark_inode_dirty_sync(ip); return -error; } @@ -923,164 +1019,12 @@ xfs_fs_clear_inode( { xfs_inode_t *ip = XFS_I(inode); - /* - * ip can be null when xfs_iget_core calls xfs_idestroy if we - * find an inode with di_mode == 0 but without IGET_CREATE set. - */ - if (ip) { - xfs_itrace_entry(ip); - XFS_STATS_INC(vn_rele); - XFS_STATS_INC(vn_remove); - XFS_STATS_INC(vn_reclaim); - XFS_STATS_DEC(vn_active); - - xfs_inactive(ip); - xfs_iflags_clear(ip, XFS_IMODIFIED); - if (xfs_reclaim(ip)) - panic("%s: cannot reclaim 0x%p\n", __func__, inode); - } - - ASSERT(XFS_I(inode) == NULL); -} - -/* - * Enqueue a work item to be picked up by the vfs xfssyncd thread. - * Doing this has two advantages: - * - It saves on stack space, which is tight in certain situations - * - It can be used (with care) as a mechanism to avoid deadlocks. - * Flushing while allocating in a full filesystem requires both. - */ -STATIC void -xfs_syncd_queue_work( - struct xfs_mount *mp, - void *data, - void (*syncer)(struct xfs_mount *, void *)) -{ - struct bhv_vfs_sync_work *work; - - work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); - INIT_LIST_HEAD(&work->w_list); - work->w_syncer = syncer; - work->w_data = data; - work->w_mount = mp; - spin_lock(&mp->m_sync_lock); - list_add_tail(&work->w_list, &mp->m_sync_list); - spin_unlock(&mp->m_sync_lock); - wake_up_process(mp->m_sync_task); -} - -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room... - */ -STATIC void -xfs_flush_inode_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - filemap_flush(inode->i_mapping); - iput(inode); -} - -void -xfs_flush_inode( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); - delay(msecs_to_jiffies(500)); -} - -/* - * This is the "bigger hammer" version of xfs_flush_inode_work... - * (IOW, "If at first you don't succeed, use a Bigger Hammer"). - */ -STATIC void -xfs_flush_device_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - sync_blockdev(mp->m_super->s_bdev); - iput(inode); -} - -void -xfs_flush_device( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); + xfs_itrace_entry(ip); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + XFS_STATS_DEC(vn_active); - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); - delay(msecs_to_jiffies(500)); - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); -} - -STATIC void -xfs_sync_worker( - struct xfs_mount *mp, - void *unused) -{ - int error; - - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) - error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR); - mp->m_sync_seq++; - wake_up(&mp->m_wait_single_sync_task); -} - -STATIC int -xfssyncd( - void *arg) -{ - struct xfs_mount *mp = arg; - long timeleft; - bhv_vfs_sync_work_t *work, *n; - LIST_HEAD (tmp); - - set_freezable(); - timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); - for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); - /* swsusp */ - try_to_freeze(); - if (kthread_should_stop() && list_empty(&mp->m_sync_list)) - break; - - spin_lock(&mp->m_sync_lock); - /* - * We can get woken by laptop mode, to do a sync - - * that's the (only!) case where the list would be - * empty with time remaining. - */ - if (!timeleft || list_empty(&mp->m_sync_list)) { - if (!timeleft) - timeleft = xfs_syncd_centisecs * - msecs_to_jiffies(10); - INIT_LIST_HEAD(&mp->m_sync_work.w_list); - list_add_tail(&mp->m_sync_work.w_list, - &mp->m_sync_list); - } - list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) - list_move(&work->w_list, &tmp); - spin_unlock(&mp->m_sync_lock); - - list_for_each_entry_safe(work, n, &tmp, w_list) { - (*work->w_syncer)(mp, work->w_data); - list_del(&work->w_list); - if (work == &mp->m_sync_work) - continue; - kmem_free(work); - } - } - - return 0; + xfs_inactive(ip); } STATIC void @@ -1099,11 +1043,9 @@ xfs_fs_put_super( struct xfs_mount *mp = XFS_M(sb); struct xfs_inode *rip = mp->m_rootip; int unmount_event_flags = 0; - int error; - - kthread_stop(mp->m_sync_task); - xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); + xfs_syncd_stop(mp); + xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI); #ifdef HAVE_DMAPI if (mp->m_flags & XFS_MOUNT_DMAPI) { @@ -1128,18 +1070,6 @@ xfs_fs_put_super( xfs_filestream_unmount(mp); XFS_bflush(mp->m_ddev_targp); - error = xfs_unmount_flush(mp, 0); - WARN_ON(error); - - /* - * If we're forcing a shutdown, typically because of a media error, - * we want to make sure we invalidate dirty pages that belong to - * referenced vnodes as well. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); - ASSERT(error != EFSCORRUPTED); - } if (mp->m_flags & XFS_MOUNT_DMAPI) { XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, @@ -1161,7 +1091,7 @@ xfs_fs_write_super( struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) - xfs_sync(XFS_M(sb), SYNC_FSDATA); + xfs_sync_fsdata(XFS_M(sb), 0); sb->s_dirt = 0; } @@ -1172,7 +1102,6 @@ xfs_fs_sync_super( { struct xfs_mount *mp = XFS_M(sb); int error; - int flags; /* * Treat a sync operation like a freeze. This is to work @@ -1186,20 +1115,10 @@ xfs_fs_sync_super( * dirty the Linux inode until after the transaction I/O * completes. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { - /* - * First stage of freeze - no more writers will make progress - * now we are here, so we flush delwri and delalloc buffers - * here, then wait for all I/O to complete. Data is frozen at - * that point. Metadata is not frozen, transactions can still - * occur here so don't bother flushing the buftarg (i.e - * SYNC_QUIESCE) because it'll just get dirty again. - */ - flags = SYNC_DATA_QUIESCE; - } else - flags = SYNC_FSDATA; - - error = xfs_sync(mp, flags); + if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) + error = xfs_quiesce_data(mp); + else + error = xfs_sync_fsdata(mp, 0); sb->s_dirt = 0; if (unlikely(laptop_mode)) { @@ -1337,9 +1256,8 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { - xfs_filestream_flush(mp); - xfs_sync(mp, SYNC_DATA_QUIESCE); - xfs_attr_quiesce(mp); + xfs_quiesce_data(mp); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1348,17 +1266,17 @@ xfs_fs_remount( /* * Second stage of a freeze. The data is already frozen so we only - * need to take care of themetadata. Once that's done write a dummy + * need to take care of the metadata. Once that's done write a dummy * record to dirty the log in case of a crash while frozen. */ -STATIC void -xfs_fs_lockfs( +STATIC int +xfs_fs_freeze( struct super_block *sb) { struct xfs_mount *mp = XFS_M(sb); - xfs_attr_quiesce(mp); - xfs_fs_log_dummy(mp); + xfs_quiesce_attr(mp); + return -xfs_fs_log_dummy(mp); } STATIC int @@ -1422,175 +1340,28 @@ xfs_fs_setxquota( /* * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - */ -STATIC int -xfs_start_flags( - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - int error; - - /* Values are in BBs */ - if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. - */ - mp->m_dalign = ap->sunit; - mp->m_swidth = ap->swidth; - } - - if (ap->logbufs != -1 && - ap->logbufs != 0 && - (ap->logbufs < XLOG_MIN_ICLOGS || - ap->logbufs > XLOG_MAX_ICLOGS)) { - cmn_err(CE_WARN, - "XFS: invalid logbufs value: %d [not %d-%d]", - ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return XFS_ERROR(EINVAL); - } - mp->m_logbufs = ap->logbufs; - if (ap->logbufsize != -1 && - ap->logbufsize != 0 && - (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || - ap->logbufsize > XLOG_MAX_RECORD_BSIZE || - !is_power_of_2(ap->logbufsize))) { - cmn_err(CE_WARN, - "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - ap->logbufsize); - return XFS_ERROR(EINVAL); - } - - error = ENOMEM; - - mp->m_logbsize = ap->logbufsize; - mp->m_fsname_len = strlen(ap->fsname) + 1; - - mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL); - if (!mp->m_fsname) - goto out; - - if (ap->rtname[0]) { - mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL); - if (!mp->m_rtname) - goto out_free_fsname; - - } - - if (ap->logname[0]) { - mp->m_logname = kstrdup(ap->logname, GFP_KERNEL); - if (!mp->m_logname) - goto out_free_rtname; - } - - if (ap->flags & XFSMNT_WSYNC) - mp->m_flags |= XFS_MOUNT_WSYNC; -#if XFS_BIG_INUMS - if (ap->flags & XFSMNT_INO64) { - mp->m_flags |= XFS_MOUNT_INO64; - mp->m_inoadd = XFS_INO64_OFFSET; - } -#endif - if (ap->flags & XFSMNT_RETERR) - mp->m_flags |= XFS_MOUNT_RETERR; - if (ap->flags & XFSMNT_NOALIGN) - mp->m_flags |= XFS_MOUNT_NOALIGN; - if (ap->flags & XFSMNT_SWALLOC) - mp->m_flags |= XFS_MOUNT_SWALLOC; - if (ap->flags & XFSMNT_OSYNCISOSYNC) - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; - if (ap->flags & XFSMNT_32BITINODES) - mp->m_flags |= XFS_MOUNT_32BITINODES; - - if (ap->flags & XFSMNT_IOSIZE) { - if (ap->iosizelog > XFS_MAX_IO_LOG || - ap->iosizelog < XFS_MIN_IO_LOG) { - cmn_err(CE_WARN, - "XFS: invalid log iosize: %d [not %d-%d]", - ap->iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return XFS_ERROR(EINVAL); - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; - } - - if (ap->flags & XFSMNT_IKEEP) - mp->m_flags |= XFS_MOUNT_IKEEP; - if (ap->flags & XFSMNT_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_ATTR2) - mp->m_flags |= XFS_MOUNT_ATTR2; - if (ap->flags & XFSMNT_NOATTR2) - mp->m_flags |= XFS_MOUNT_NOATTR2; - - if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * no recovery flag requires a read-only mount - */ - if (ap->flags & XFSMNT_NORECOVERY) { - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - cmn_err(CE_WARN, - "XFS: tried to mount a FS read-write without recovery!"); - return XFS_ERROR(EINVAL); - } - mp->m_flags |= XFS_MOUNT_NORECOVERY; - } - - if (ap->flags & XFSMNT_NOUUID) - mp->m_flags |= XFS_MOUNT_NOUUID; - if (ap->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - if (ap->flags2 & XFSMNT2_FILESTREAMS) - mp->m_flags |= XFS_MOUNT_FILESTREAMS; - - if (ap->flags & XFSMNT_DMAPI) - mp->m_flags |= XFS_MOUNT_DMAPI; - return 0; - - - out_free_rtname: - kfree(mp->m_rtname); - out_free_fsname: - kfree(mp->m_fsname); - out: - return error; -} - -/* - * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. */ STATIC int xfs_finish_flags( - struct xfs_mount_args *ap, struct xfs_mount *mp) { int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller then the log stripe */ + /* Fail a mount where the logbuf is smaller than the log stripe */ if (xfs_sb_version_haslogv2(&mp->m_sb)) { - if ((ap->logbufsize <= 0) && - (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize > 0 && - ap->logbufsize < mp->m_sb.sb_logsunit) { + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { cmn_err(CE_WARN, "XFS: logbuf size must be greater than or equal to log stripe size"); return XFS_ERROR(EINVAL); } } else { /* Fail a mount if the logbuf is larger than 32K */ - if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { cmn_err(CE_WARN, "XFS: logbuf size for version 1 logs must be 16K or 32K"); return XFS_ERROR(EINVAL); @@ -1602,7 +1373,7 @@ xfs_finish_flags( * told by noattr2 to turn it off */ if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(ap->flags & XFSMNT_NOATTR2)) + !(mp->m_flags & XFS_MOUNT_NOATTR2)) mp->m_flags |= XFS_MOUNT_ATTR2; /* @@ -1614,48 +1385,6 @@ xfs_finish_flags( return XFS_ERROR(EROFS); } - /* - * check for shared mount. - */ - if (ap->flags & XFSMNT_SHARED) { - if (!xfs_sb_version_hasshared(&mp->m_sb)) - return XFS_ERROR(EINVAL); - - /* - * For IRIX 6.5, shared mounts must have the shared - * version bit set, have the persistent readonly - * field set, must be version 0 and can only be mounted - * read-only. - */ - if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || - (mp->m_sb.sb_shared_vn != 0)) - return XFS_ERROR(EINVAL); - - mp->m_flags |= XFS_MOUNT_SHARED; - - /* - * Shared XFS V0 can't deal with DMI. Return EINVAL. - */ - if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) - return XFS_ERROR(EINVAL); - } - - if (ap->flags & XFSMNT_UQUOTA) { - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - if (ap->flags & XFSMNT_UQUOTAENF) - mp->m_qflags |= XFS_UQUOTA_ENFD; - } - - if (ap->flags & XFSMNT_GQUOTA) { - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - if (ap->flags & XFSMNT_GQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } else if (ap->flags & XFSMNT_PQUOTA) { - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - if (ap->flags & XFSMNT_PQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } - return 0; } @@ -1667,19 +1396,14 @@ xfs_fs_fill_super( { struct inode *root; struct xfs_mount *mp = NULL; - struct xfs_mount_args *args; int flags = 0, error = ENOMEM; - - args = xfs_args_allocate(sb, silent); - if (!args) - return -ENOMEM; + char *mtpt = NULL; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) - goto out_free_args; + goto out; spin_lock_init(&mp->m_sb_lock); - mutex_init(&mp->m_ilock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_LIST_HEAD(&mp->m_sync_list); @@ -1689,12 +1413,9 @@ xfs_fs_fill_super( mp->m_super = sb; sb->s_fs_info = mp; - if (sb->s_flags & MS_RDONLY) - mp->m_flags |= XFS_MOUNT_RDONLY; - - error = xfs_parseargs(mp, (char *)data, args, 0); + error = xfs_parseargs(mp, (char *)data, &mtpt); if (error) - goto out_free_mp; + goto out_free_fsname; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1702,33 +1423,28 @@ xfs_fs_fill_super( sb->s_qcop = &xfs_quotactl_operations; sb->s_op = &xfs_super_operations; - error = xfs_dmops_get(mp, args); + error = xfs_dmops_get(mp); if (error) - goto out_free_mp; - error = xfs_qmops_get(mp, args); + goto out_free_fsname; + error = xfs_qmops_get(mp); if (error) goto out_put_dmops; - if (args->flags & XFSMNT_QUIET) + if (silent) flags |= XFS_MFSI_QUIET; - error = xfs_open_devices(mp, args); + error = xfs_open_devices(mp); if (error) goto out_put_qmops; if (xfs_icsb_init_counters(mp)) mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; - /* - * Setup flags based on mount(2) options and then the superblock - */ - error = xfs_start_flags(args, mp); - if (error) - goto out_free_fsname; error = xfs_readsb(mp, flags); if (error) - goto out_free_fsname; - error = xfs_finish_flags(args, mp); + goto out_destroy_counters; + + error = xfs_finish_flags(mp); if (error) goto out_free_sb; @@ -1747,7 +1463,7 @@ xfs_fs_fill_super( if (error) goto out_filestream_unmount; - XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); + XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); sb->s_dirt = 1; sb->s_magic = XFS_SB_MAGIC; @@ -1772,35 +1488,31 @@ xfs_fs_fill_super( goto fail_vnrele; } - mp->m_sync_work.w_syncer = xfs_sync_worker; - mp->m_sync_work.w_mount = mp; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); - if (IS_ERR(mp->m_sync_task)) { - error = -PTR_ERR(mp->m_sync_task); + error = xfs_syncd_init(mp); + if (error) goto fail_vnrele; - } - xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); + kfree(mtpt); - kfree(args); + xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); return 0; out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); - out_free_fsname: - xfs_free_fsname(mp); + out_destroy_counters: xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); out_put_qmops: xfs_qmops_put(mp); out_put_dmops: xfs_dmops_put(mp); - out_free_mp: + out_free_fsname: + xfs_free_fsname(mp); + kfree(mtpt); kfree(mp); - out_free_args: - kfree(args); + out: return -error; fail_vnrele: @@ -1820,8 +1532,6 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); XFS_bflush(mp->m_ddev_targp); - error = xfs_unmount_flush(mp, 0); - WARN_ON(error); xfs_unmountfs(mp); goto out_free_sb; @@ -1847,7 +1557,7 @@ static struct super_operations xfs_super_operations = { .put_super = xfs_fs_put_super, .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, - .write_super_lockfs = xfs_fs_lockfs, + .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, @@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void) if (!xfs_bmap_trace_buf) goto out_free_alloc_trace; #endif -#ifdef XFS_BMBT_TRACE +#ifdef XFS_BTREE_TRACE + xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE, + KM_MAYFAIL); + if (!xfs_allocbt_trace_buf) + goto out_free_bmap_trace; + + xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL); + if (!xfs_inobt_trace_buf) + goto out_free_allocbt_trace; + xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); if (!xfs_bmbt_trace_buf) - goto out_free_bmap_trace; + goto out_free_inobt_trace; #endif #ifdef XFS_ATTR_TRACE xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); @@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void) ktrace_free(xfs_attr_trace_buf); out_free_bmbt_trace: #endif -#ifdef XFS_BMBT_TRACE +#ifdef XFS_BTREE_TRACE ktrace_free(xfs_bmbt_trace_buf); + out_free_inobt_trace: + ktrace_free(xfs_inobt_trace_buf); + out_free_allocbt_trace: + ktrace_free(xfs_allocbt_trace_buf); out_free_bmap_trace: #endif #ifdef XFS_BMAP_TRACE @@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void) #ifdef XFS_ATTR_TRACE ktrace_free(xfs_attr_trace_buf); #endif -#ifdef XFS_BMBT_TRACE +#ifdef XFS_BTREE_TRACE ktrace_free(xfs_bmbt_trace_buf); + ktrace_free(xfs_inobt_trace_buf); + ktrace_free(xfs_allocbt_trace_buf); #endif #ifdef XFS_BMAP_TRACE ktrace_free(xfs_bmap_trace_buf); @@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void) STATIC int __init xfs_init_zones(void) { - xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, - xfs_fs_inode_init_once); - if (!xfs_vnode_zone) - goto out; xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); if (!xfs_ioend_zone) - goto out_destroy_vnode_zone; + goto out; xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, xfs_ioend_zone); @@ -1970,6 +1689,7 @@ xfs_init_zones(void) "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), "xfs_btree_cur"); if (!xfs_btree_cur_zone) @@ -2017,8 +1737,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, NULL); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; @@ -2066,8 +1786,6 @@ xfs_init_zones(void) mempool_destroy(xfs_ioend_pool); out_destroy_ioend_zone: kmem_zone_destroy(xfs_ioend_zone); - out_destroy_vnode_zone: - kmem_zone_destroy(xfs_vnode_zone); out: return -ENOMEM; } @@ -2092,7 +1810,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_log_ticket_zone); mempool_destroy(xfs_ioend_pool); kmem_zone_destroy(xfs_ioend_zone); - kmem_zone_destroy(xfs_vnode_zone); } @@ -2100,13 +1817,12 @@ STATIC int __init init_xfs_fs(void) { int error; - static char message[] __initdata = KERN_INFO \ - XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; - printk(message); + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); ktrace_init(64); - vn_init(); + xfs_ioend_init(); xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index fe2ef4e6a0f9..d5d776d4cd67 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -20,24 +20,12 @@ #include <linux/exportfs.h> -#ifdef CONFIG_XFS_DMAPI -# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) -# define vfs_initdmapi() dmapi_init() -# define vfs_exitdmapi() dmapi_uninit() -#else -# define vfs_insertdmapi(vfs) do { } while (0) -# define vfs_initdmapi() do { } while (0) -# define vfs_exitdmapi() do { } while (0) -#endif - #ifdef CONFIG_XFS_QUOTA -# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops) extern void xfs_qm_init(void); extern void xfs_qm_exit(void); # define vfs_initquota() xfs_qm_init() # define vfs_exitquota() xfs_qm_exit() #else -# define vfs_insertquota(vfs) do { } while (0) # define vfs_initquota() do { } while (0) # define vfs_exitquota() do { } while (0) #endif @@ -101,9 +89,6 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); -extern void xfs_flush_inode(struct xfs_inode *); -extern void xfs_flush_device(struct xfs_inode *); - extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c new file mode 100644 index 000000000000..2ed035354c26 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_dir2_sf.h" +#include "xfs_attr_sf.h" +#include "xfs_inode.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_mru_cache.h" +#include "xfs_filestream.h" +#include "xfs_vnodeops.h" +#include "xfs_utils.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_rw.h" + +#include <linux/kthread.h> +#include <linux/freezer.h> + +/* + * Sync all the inodes in the given AG according to the + * direction given by the flags. + */ +STATIC int +xfs_sync_inodes_ag( + xfs_mount_t *mp, + int ag, + int flags) +{ + xfs_perag_t *pag = &mp->m_perag[ag]; + int nr_found; + uint32_t first_index = 0; + int error = 0; + int last_error = 0; + int fflag = XFS_B_ASYNC; + + if (flags & SYNC_DELWRI) + fflag = XFS_B_DELWRI; + if (flags & SYNC_WAIT) + fflag = 0; /* synchronous overrides all */ + + do { + struct inode *inode; + xfs_inode_t *ip = NULL; + int lock_flags = XFS_ILOCK_SHARED; + + /* + * use a gang lookup to find the next inode in the tree + * as the tree is sparse and a gang lookup walks to find + * the number of objects requested. + */ + read_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void**)&ip, first_index, 1); + + if (!nr_found) { + read_unlock(&pag->pag_ici_lock); + break; + } + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) { + read_unlock(&pag->pag_ici_lock); + break; + } + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(mp)) { + read_unlock(&pag->pag_ici_lock); + return 0; + } + + /* + * If we can't get a reference on the inode, it must be + * in reclaim. Leave it for the reclaim code to flush. + */ + inode = VFS_I(ip); + if (!igrab(inode)) { + read_unlock(&pag->pag_ici_lock); + continue; + } + read_unlock(&pag->pag_ici_lock); + + /* avoid new or bad inodes */ + if (is_bad_inode(inode) || + xfs_iflags_test(ip, XFS_INEW)) { + IRELE(ip); + continue; + } + + /* + * If we have to flush data or wait for I/O completion + * we need to hold the iolock. + */ + if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + lock_flags |= XFS_IOLOCK_SHARED; + error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); + if (flags & SYNC_IOWAIT) + xfs_ioend_wait(ip); + } + xfs_ilock(ip, XFS_ILOCK_SHARED); + + if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) { + if (flags & SYNC_WAIT) { + xfs_iflock(ip); + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, XFS_IFLUSH_SYNC); + else + xfs_ifunlock(ip); + } else if (xfs_iflock_nowait(ip)) { + if (!xfs_inode_clean(ip)) + error = xfs_iflush(ip, XFS_IFLUSH_DELWRI); + else + xfs_ifunlock(ip); + } + } + xfs_iput(ip, lock_flags); + + if (error) + last_error = error; + /* + * bail out if the filesystem is corrupted. + */ + if (error == EFSCORRUPTED) + return XFS_ERROR(error); + + } while (nr_found); + + return last_error; +} + +int +xfs_sync_inodes( + xfs_mount_t *mp, + int flags) +{ + int error; + int last_error; + int i; + int lflags = XFS_LOG_FORCE; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + error = 0; + last_error = 0; + + if (flags & SYNC_WAIT) + lflags |= XFS_LOG_SYNC; + + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + if (!mp->m_perag[i].pag_ici_init) + continue; + error = xfs_sync_inodes_ag(mp, i, flags); + if (error) + last_error = error; + if (error == EFSCORRUPTED) + break; + } + if (flags & SYNC_DELWRI) + xfs_log_force(mp, 0, lflags); + + return XFS_ERROR(last_error); +} + +STATIC int +xfs_commit_dummy_trans( + struct xfs_mount *mp, + uint log_flags) +{ + struct xfs_inode *ip = mp->m_rootip; + struct xfs_trans *tp; + int error; + + /* + * Put a dummy transaction in the log to tell recovery + * that all others are OK. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* XXX(hch): ignoring the error here.. */ + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_log_force(mp, 0, log_flags); + return 0; +} + +int +xfs_sync_fsdata( + struct xfs_mount *mp, + int flags) +{ + struct xfs_buf *bp; + struct xfs_buf_log_item *bip; + int error = 0; + + /* + * If this is xfssyncd() then only sync the superblock if we can + * lock it without sleeping and it is not pinned. + */ + if (flags & SYNC_BDFLUSH) { + ASSERT(!(flags & SYNC_WAIT)); + + bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); + if (!bp) + goto out; + + bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *); + if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) + goto out_brelse; + } else { + bp = xfs_getsb(mp, 0); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for someone, maybe + * ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have + * the superblock buffer locked at that point so it can + * become pinned in between there and here. + */ + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0, XFS_LOG_FORCE); + } + + + if (flags & SYNC_WAIT) + XFS_BUF_UNASYNC(bp); + else + XFS_BUF_ASYNC(bp); + + return xfs_bwrite(mp, bp); + + out_brelse: + xfs_buf_relse(bp); + out: + return error; +} + +/* + * When remounting a filesystem read-only or freezing the filesystem, we have + * two phases to execute. This first phase is syncing the data before we + * quiesce the filesystem, and the second is flushing all the inodes out after + * we've waited for all the transactions created by the first phase to + * complete. The second phase ensures that the inodes are written to their + * location on disk rather than just existing in transactions in the log. This + * means after a quiesce there is no log replay required to write the inodes to + * disk (this is the main difference between a sync and a quiesce). + */ +/* + * First stage of freeze - no writers will make progress now we are here, + * so we flush delwri and delalloc buffers here, then wait for all I/O to + * complete. Data is frozen at that point. Metadata is not frozen, + * transactions can still occur here so don't bother flushing the buftarg + * because it'll just get dirty again. + */ +int +xfs_quiesce_data( + struct xfs_mount *mp) +{ + int error; + + /* push non-blocking */ + xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH); + XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); + xfs_filestream_flush(mp); + + /* push and block */ + xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT); + XFS_QM_DQSYNC(mp, SYNC_WAIT); + + /* write superblock and hoover up shutdown errors */ + error = xfs_sync_fsdata(mp, 0); + + /* flush data-only devices */ + if (mp->m_rtdev_targp) + XFS_bflush(mp->m_rtdev_targp); + + return error; +} + +STATIC void +xfs_quiesce_fs( + struct xfs_mount *mp) +{ + int count = 0, pincount; + + xfs_flush_buftarg(mp->m_ddev_targp, 0); + xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); + + /* + * This loop must run at least twice. The first instance of the loop + * will flush most meta data but that will generate more meta data + * (typically directory updates). Which then must be flushed and + * logged before we can write the unmount record. + */ + do { + xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT); + pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (!pincount) { + delay(50); + count++; + } + } while (count < 2); +} + +/* + * Second stage of a quiesce. The data is already synced, now we have to take + * care of the metadata. New transactions are already blocked, so we need to + * wait for any remaining transactions to drain out before proceding. + */ +void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* flush inodes and push all remaining buffers out to disk */ + xfs_quiesce_fs(mp); + + ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp, 1); + if (error) + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + xfs_log_unmount_write(mp); + xfs_unmountfs_writesb(mp); +} + +/* + * Enqueue a work item to be picked up by the vfs xfssyncd thread. + * Doing this has two advantages: + * - It saves on stack space, which is tight in certain situations + * - It can be used (with care) as a mechanism to avoid deadlocks. + * Flushing while allocating in a full filesystem requires both. + */ +STATIC void +xfs_syncd_queue_work( + struct xfs_mount *mp, + void *data, + void (*syncer)(struct xfs_mount *, void *)) +{ + struct bhv_vfs_sync_work *work; + + work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); + INIT_LIST_HEAD(&work->w_list); + work->w_syncer = syncer; + work->w_data = data; + work->w_mount = mp; + spin_lock(&mp->m_sync_lock); + list_add_tail(&work->w_list, &mp->m_sync_list); + spin_unlock(&mp->m_sync_lock); + wake_up_process(mp->m_sync_task); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room... + */ +STATIC void +xfs_flush_inode_work( + struct xfs_mount *mp, + void *arg) +{ + struct inode *inode = arg; + filemap_flush(inode->i_mapping); + iput(inode); +} + +void +xfs_flush_inode( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + igrab(inode); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); + delay(msecs_to_jiffies(500)); +} + +/* + * This is the "bigger hammer" version of xfs_flush_inode_work... + * (IOW, "If at first you don't succeed, use a Bigger Hammer"). + */ +STATIC void +xfs_flush_device_work( + struct xfs_mount *mp, + void *arg) +{ + struct inode *inode = arg; + sync_blockdev(mp->m_super->s_bdev); + iput(inode); +} + +void +xfs_flush_device( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + igrab(inode); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); + delay(msecs_to_jiffies(500)); + xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); +} + +/* + * Every sync period we need to unpin all items, reclaim inodes, sync + * quota and write out the superblock. We might need to cover the log + * to indicate it is idle. + */ +STATIC void +xfs_sync_worker( + struct xfs_mount *mp, + void *unused) +{ + int error; + + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); + xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC); + /* dgc: errors ignored here */ + error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH); + error = xfs_sync_fsdata(mp, SYNC_BDFLUSH); + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); + } + mp->m_sync_seq++; + wake_up(&mp->m_wait_single_sync_task); +} + +STATIC int +xfssyncd( + void *arg) +{ + struct xfs_mount *mp = arg; + long timeleft; + bhv_vfs_sync_work_t *work, *n; + LIST_HEAD (tmp); + + set_freezable(); + timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); + for (;;) { + timeleft = schedule_timeout_interruptible(timeleft); + /* swsusp */ + try_to_freeze(); + if (kthread_should_stop() && list_empty(&mp->m_sync_list)) + break; + + spin_lock(&mp->m_sync_lock); + /* + * We can get woken by laptop mode, to do a sync - + * that's the (only!) case where the list would be + * empty with time remaining. + */ + if (!timeleft || list_empty(&mp->m_sync_list)) { + if (!timeleft) + timeleft = xfs_syncd_centisecs * + msecs_to_jiffies(10); + INIT_LIST_HEAD(&mp->m_sync_work.w_list); + list_add_tail(&mp->m_sync_work.w_list, + &mp->m_sync_list); + } + list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) + list_move(&work->w_list, &tmp); + spin_unlock(&mp->m_sync_lock); + + list_for_each_entry_safe(work, n, &tmp, w_list) { + (*work->w_syncer)(mp, work->w_data); + list_del(&work->w_list); + if (work == &mp->m_sync_work) + continue; + kmem_free(work); + } + } + + return 0; +} + +int +xfs_syncd_init( + struct xfs_mount *mp) +{ + mp->m_sync_work.w_syncer = xfs_sync_worker; + mp->m_sync_work.w_mount = mp; + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + if (IS_ERR(mp->m_sync_task)) + return -PTR_ERR(mp->m_sync_task); + return 0; +} + +void +xfs_syncd_stop( + struct xfs_mount *mp) +{ + kthread_stop(mp->m_sync_task); +} + +int +xfs_reclaim_inode( + xfs_inode_t *ip, + int locked, + int sync_mode) +{ + xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); + + /* The hash lock here protects a thread in xfs_iget_core from + * racing with us on linking the inode back with a vnode. + * Once we have the XFS_IRECLAIM flag set it will not touch + * us. + */ + write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + if (__xfs_iflags_test(ip, XFS_IRECLAIM) || + !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); + if (locked) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(ip->i_mount, pag); + + /* + * If the inode is still dirty, then flush it out. If the inode + * is not in the AIL, then it will be OK to flush it delwri as + * long as xfs_iflush() does not keep any references to the inode. + * We leave that decision up to xfs_iflush() since it has the + * knowledge of whether it's OK to simply do a delwri flush of + * the inode or whether we need to wait until the inode is + * pulled from the AIL. + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ + if (!locked) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + } + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); + xfs_ifunlock(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_ireclaim(ip); + return 0; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); +} + +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); +} + +void +xfs_inode_clear_reclaim_tag( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); + + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); +} + + +STATIC void +xfs_reclaim_inodes_ag( + xfs_mount_t *mp, + int ag, + int noblock, + int mode) +{ + xfs_inode_t *ip = NULL; + xfs_perag_t *pag = &mp->m_perag[ag]; + int nr_found; + uint32_t first_index; + int skipped; + +restart: + first_index = 0; + skipped = 0; + do { + /* + * use a gang lookup to find the next inode in the tree + * as the tree is sparse and a gang lookup walks to find + * the number of objects requested. + */ + read_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, + (void**)&ip, first_index, 1, + XFS_ICI_RECLAIM_TAG); + + if (!nr_found) { + read_unlock(&pag->pag_ici_lock); + break; + } + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) { + read_unlock(&pag->pag_ici_lock); + break; + } + + /* ignore if already under reclaim */ + if (xfs_iflags_test(ip, XFS_IRECLAIM)) { + read_unlock(&pag->pag_ici_lock); + continue; + } + + if (noblock) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + read_unlock(&pag->pag_ici_lock); + continue; + } + if (xfs_ipincount(ip) || + !xfs_iflock_nowait(ip)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + read_unlock(&pag->pag_ici_lock); + continue; + } + } + read_unlock(&pag->pag_ici_lock); + + /* + * hmmm - this is an inode already in reclaim. Do + * we even bother catching it here? + */ + if (xfs_reclaim_inode(ip, noblock, mode)) + skipped++; + } while (nr_found); + + if (skipped) { + delay(1); + goto restart; + } + return; + +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int noblock, + int mode) +{ + int i; + + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + if (!mp->m_perag[i].pag_ici_init) + continue; + xfs_reclaim_inodes_ag(mp, i, noblock, mode); + } + return 0; +} + + diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h new file mode 100644 index 000000000000..5f6de1efe1f6 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; + +typedef struct bhv_vfs_sync_work { + struct list_head w_list; + struct xfs_mount *w_mount; + void *w_data; /* syncer routine argument */ + void (*w_syncer)(struct xfs_mount *, void *); +} bhv_vfs_sync_work_t; + +#define SYNC_ATTR 0x0001 /* sync attributes */ +#define SYNC_DELWRI 0x0002 /* look at delayed writes */ +#define SYNC_WAIT 0x0004 /* wait for i/o to complete */ +#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */ +#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */ + +int xfs_syncd_init(struct xfs_mount *mp); +void xfs_syncd_stop(struct xfs_mount *mp); + +int xfs_sync_inodes(struct xfs_mount *mp, int flags); +int xfs_sync_fsdata(struct xfs_mount *mp, int flags); + +int xfs_quiesce_data(struct xfs_mount *mp); +void xfs_quiesce_attr(struct xfs_mount *mp); + +void xfs_flush_inode(struct xfs_inode *ip); +void xfs_flush_device(struct xfs_inode *ip); + +int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); +int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, + struct xfs_inode *ip); +#endif diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 7dacb5bbde3f..916c0ffb6083 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler( static ctl_table xfs_table[] = { { - .ctl_name = XFS_RESTRICT_CHOWN, - .procname = "restrict_chown", - .data = &xfs_params.restrict_chown.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &xfs_params.restrict_chown.min, - .extra2 = &xfs_params.restrict_chown.max - }, - { .ctl_name = XFS_SGID_INHERIT, .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index 4aadb8056c37..b9937d450f8e 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/ xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is * not a member of parent dir GID. */ xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ @@ -68,7 +67,7 @@ typedef struct xfs_param { enum { /* XFS_REFCACHE_SIZE = 1 */ /* XFS_REFCACHE_PURGE = 2 */ - XFS_RESTRICT_CHOWN = 3, + /* XFS_RESTRICT_CHOWN = 3 */ XFS_SGID_INHERIT = 4, XFS_SYMLINK_MODE = 5, XFS_PANIC_MASK = 6, diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h deleted file mode 100644 index 7e60c7776b1c..000000000000 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VFS_H__ -#define __XFS_VFS_H__ - -#include <linux/vfs.h> -#include "xfs_fs.h" - -struct inode; - -struct fid; -struct cred; -struct seq_file; -struct super_block; -struct xfs_inode; -struct xfs_mount; -struct xfs_mount_args; - -typedef struct kstatfs bhv_statvfs_t; - -typedef struct bhv_vfs_sync_work { - struct list_head w_list; - struct xfs_mount *w_mount; - void *w_data; /* syncer routine argument */ - void (*w_syncer)(struct xfs_mount *, void *); -} bhv_vfs_sync_work_t; - -#define SYNC_ATTR 0x0001 /* sync attributes */ -#define SYNC_CLOSE 0x0002 /* close file system down */ -#define SYNC_DELWRI 0x0004 /* look at delayed writes */ -#define SYNC_WAIT 0x0008 /* wait for i/o to complete */ -#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */ -#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ -#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ -#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ -#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ - -/* - * When remounting a filesystem read-only or freezing the filesystem, - * we have two phases to execute. This first phase is syncing the data - * before we quiesce the fielsystem, and the second is flushing all the - * inodes out after we've waited for all the transactions created by - * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE - * to ensure that the inodes are written to their location on disk - * rather than just existing in transactions in the log. This means - * after a quiesce there is no log replay required to write the inodes - * to disk (this is the main difference between a sync and a quiesce). - */ -#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT) -#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT) - -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ -#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ -#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ - -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - -#endif /* __XFS_VFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c deleted file mode 100644 index b52528bbbfff..000000000000 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_vnodeops.h" -#include "xfs_bmap_btree.h" -#include "xfs_inode.h" - -/* - * And this gunk is needed for xfs_mount.h" - */ -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_dmapi.h" -#include "xfs_inum.h" -#include "xfs_ag.h" -#include "xfs_mount.h" - - -/* - * Dedicated vnode inactive/reclaim sync wait queues. - * Prime number of hash buckets since address is used as the key. - */ -#define NVSYNC 37 -#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) -static wait_queue_head_t vsync[NVSYNC]; - -void __init -vn_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&vsync[i]); -} - -void -vn_iowait( - xfs_inode_t *ip) -{ - wait_queue_head_t *wq = vptosync(ip); - - wait_event(*wq, (atomic_read(&ip->i_iocount) == 0)); -} - -void -vn_iowake( - xfs_inode_t *ip) -{ - if (atomic_dec_and_test(&ip->i_iocount)) - wake_up(vptosync(ip)); -} - -/* - * Volume managers supporting multiple paths can send back ENODEV when the - * final path disappears. In this case continuing to fill the page cache - * with dirty data which cannot be written out is evil, so prevent that. - */ -void -vn_ioerror( - xfs_inode_t *ip, - int error, - char *f, - int l) -{ - if (unlikely(error == -ENODEV)) - xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); -} - -#ifdef XFS_INODE_TRACE - -/* - * Reference count of Linux inode if present, -1 if the xfs_inode - * has no associated Linux inode. - */ -static inline int xfs_icount(struct xfs_inode *ip) -{ - struct inode *vp = VFS_I(ip); - - if (vp) - return vn_count(vp); - return -1; -} - -#define KTRACE_ENTER(ip, vk, s, line, ra) \ - ktrace_enter( (ip)->i_trace, \ -/* 0 */ (void *)(__psint_t)(vk), \ -/* 1 */ (void *)(s), \ -/* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(__psint_t)xfs_icount(ip), \ -/* 4 */ (void *)(ra), \ -/* 5 */ NULL, \ -/* 6 */ (void *)(__psint_t)current_cpu(), \ -/* 7 */ (void *)(__psint_t)current_pid(), \ -/* 8 */ (void *)__return_address, \ -/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL) - -/* - * Vnode tracing code. - */ -void -_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra); -} - -void -_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra); -} - -void -xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra); -} - -void -_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra); -} - -void -xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra); -} -#endif /* XFS_INODE_TRACE */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 683ce16210ff..f65983a230d3 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -18,7 +18,10 @@ #ifndef __XFS_VNODE_H__ #define __XFS_VNODE_H__ +#include "xfs_fs.h" + struct file; +struct xfs_inode; struct xfs_iomap; struct attrlist_cursor_kern; @@ -51,40 +54,6 @@ struct attrlist_cursor_kern; Prevent VM access to the pages until the operation completes. */ - -extern void vn_init(void); - -/* - * Yeah, these don't take vnode anymore at all, all this should be - * cleaned up at some point. - */ -extern void vn_iowait(struct xfs_inode *ip); -extern void vn_iowake(struct xfs_inode *ip); -extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); - -static inline int vn_count(struct inode *vp) -{ - return atomic_read(&vp->i_count); -} - -#define IHOLD(ip) \ -do { \ - ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ - atomic_inc(&(VFS_I(ip)->i_count)); \ - xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ -} while (0) - -#define IRELE(ip) \ -do { \ - xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ - iput(VFS_I(ip)); \ -} while (0) - -static inline struct inode *vn_grab(struct inode *vp) -{ - return igrab(vp); -} - /* * Dealing with bad inodes */ @@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) PAGECACHE_TAG_DIRTY) -/* - * Tracking vnode activity. - */ -#if defined(XFS_INODE_TRACE) - -#define INODE_TRACE_SIZE 16 /* number of trace entries */ -#define INODE_KTRACE_ENTRY 1 -#define INODE_KTRACE_EXIT 2 -#define INODE_KTRACE_HOLD 3 -#define INODE_KTRACE_REF 4 -#define INODE_KTRACE_RELE 5 - -extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *); -extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *); -extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); -extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); -extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); -#define xfs_itrace_entry(ip) \ - _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) -#define xfs_itrace_exit(ip) \ - _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) -#define xfs_itrace_exit_tag(ip, tag) \ - _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) -#define xfs_itrace_ref(ip) \ - _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address) - -#else -#define xfs_itrace_entry(a) -#define xfs_itrace_exit(a) -#define xfs_itrace_exit_tag(a, b) -#define xfs_itrace_hold(a, b, c, d) -#define xfs_itrace_ref(a) -#define xfs_itrace_rele(a, b, c, d) -#endif - #endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index f2705f2fd43c..591ca6602bfb 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ xfs_qm_dqinit( if (brandnewdquot) { dqp->dq_flnext = dqp->dq_flprev = dqp; mutex_init(&dqp->q_qlock); - sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); + init_waitqueue_head(&dqp->q_pinwait); /* * Because we want to use a counting completion, complete @@ -131,7 +131,7 @@ xfs_qm_dqinit( dqp->q_res_bcount = 0; dqp->q_res_icount = 0; dqp->q_res_rtbcount = 0; - dqp->q_pincount = 0; + atomic_set(&dqp->q_pincount, 0); dqp->q_hash = NULL; ASSERT(dqp->dq_flnext == dqp->dq_flprev); @@ -1221,16 +1221,14 @@ xfs_qm_dqflush( xfs_dqtrace_entry(dqp, "DQFLUSH"); /* - * If not dirty, nada. + * If not dirty, or it's pinned and we are not supposed to + * block, nada. */ - if (!XFS_DQ_IS_DIRTY(dqp)) { + if (!XFS_DQ_IS_DIRTY(dqp) || + (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) { xfs_dqfunlock(dqp); - return (0); + return 0; } - - /* - * Cant flush a pinned dquot. Wait for it. - */ xfs_qm_dqunpin_wait(dqp); /* @@ -1274,10 +1272,8 @@ xfs_qm_dqflush( dqp->dq_flags &= ~(XFS_DQ_DIRTY); mp = dqp->q_mount; - /* lsn is 64 bits */ - spin_lock(&mp->m_ail_lock); - dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn; - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); /* * Attach an iodone routine so that we can remove this dquot from the @@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done( xfs_dq_logitem_t *qip) { xfs_dquot_t *dqp; + struct xfs_ail *ailp; dqp = qip->qli_dquot; + ailp = qip->qli_item.li_ailp; /* * We only want to pull the item from the AIL if its @@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done( if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && qip->qli_item.li_lsn == qip->qli_flush_lsn) { - spin_lock(&dqp->q_mount->m_ail_lock); - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); if (qip->qli_item.li_lsn == qip->qli_flush_lsn) - xfs_trans_delete_ail(dqp->q_mount, - (xfs_log_item_t*)qip); + xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip); else - spin_unlock(&dqp->q_mount->m_ail_lock); + spin_unlock(&ailp->xa_lock); } /* @@ -1375,7 +1370,7 @@ xfs_dqunlock( mutex_unlock(&(dqp->q_qlock)); if (dqp->q_logitem.qli_dquot == dqp) { /* Once was dqp->q_mount, but might just have been cleared */ - xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, + xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp, (xfs_log_item_t*)&(dqp->q_logitem)); } } @@ -1489,7 +1484,7 @@ xfs_qm_dqpurge( "xfs_qm_dqpurge: dquot %p flush failed", dqp); xfs_dqflock(dqp); } - ASSERT(dqp->q_pincount == 0); + ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 8958d0faf8d3..7e455337e2ba 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -83,8 +83,8 @@ typedef struct xfs_dquot { xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ mutex_t q_qlock; /* quota lock */ struct completion q_flush; /* flush completion queue */ - uint q_pincount; /* pin count for this dquot */ - sv_t q_pinwait; /* sync var for pinning */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ #ifdef XFS_DQUOT_TRACE struct ktrace *q_trace; /* trace header structure */ #endif diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index f028644caa5e..1728f6a7c4f5 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format( /* * Increment the pin count of the given dquot. - * This value is protected by pinlock spinlock in the xQM structure. */ STATIC void xfs_qm_dquot_logitem_pin( xfs_dq_logitem_t *logitem) { - xfs_dquot_t *dqp; + xfs_dquot_t *dqp = logitem->qli_dquot; - dqp = logitem->qli_dquot; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); - dqp->q_pincount++; - spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); + atomic_inc(&dqp->q_pincount); } /* * Decrement the pin count of the given dquot, and wake up * anyone in xfs_dqwait_unpin() if the count goes to 0. The - * dquot must have been previously pinned with a call to xfs_dqpin(). + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). */ /* ARGSUSED */ STATIC void @@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin( xfs_dq_logitem_t *logitem, int stale) { - xfs_dquot_t *dqp; + xfs_dquot_t *dqp = logitem->qli_dquot; - dqp = logitem->qli_dquot; - ASSERT(dqp->q_pincount > 0); - spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); - dqp->q_pincount--; - if (dqp->q_pincount == 0) { - sv_broadcast(&dqp->q_pinwait); - } - spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); } /* ARGSUSED */ @@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait( xfs_dquot_t *dqp) { ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (dqp->q_pincount == 0) { + if (atomic_read(&dqp->q_pincount) == 0) return; - } /* * Give the log a push so we don't wait here too long. */ xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); - spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); - if (dqp->q_pincount == 0) { - spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); - return; - } - sv_wait(&(dqp->q_pinwait), PINOD, - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } /* @@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock( uint retval; dqp = qip->qli_dquot; - if (dqp->q_pincount > 0) + if (atomic_read(&dqp->q_pincount) > 0) return (XFS_ITEM_PINNED); if (! xfs_qm_dqlock_nowait(dqp)) @@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed( xfs_lsn_t lsn) { xfs_qoff_logitem_t *qfs; + struct xfs_ail *ailp; qfs = qfe->qql_start_lip; - spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); + ailp = qfs->qql_item.li_ailp; + spin_lock(&ailp->xa_lock); /* * Delete the qoff-start logitem from the AIL. - * xfs_trans_delete_ail() drops the AIL lock. + * xfs_trans_ail_delete() drops the AIL lock. */ - xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index df0ffef9775a..6b13960cf318 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -20,7 +20,6 @@ #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_clnt.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -396,13 +395,10 @@ xfs_qm_mount_quotas( /* * Called from the vfsops layer. */ -int +void xfs_qm_unmount_quotas( xfs_mount_t *mp) { - xfs_inode_t *uqp, *gqp; - int error = 0; - /* * Release the dquots that root inode, et al might be holding, * before we flush quotas and blow away the quotainfo structure. @@ -415,43 +411,18 @@ xfs_qm_unmount_quotas( xfs_qm_dqdetach(mp->m_rsumip); /* - * Flush out the quota inodes. + * Release the quota inodes. */ - uqp = gqp = NULL; if (mp->m_quotainfo) { - if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { - xfs_ilock(uqp, XFS_ILOCK_EXCL); - xfs_iflock(uqp); - error = xfs_iflush(uqp, XFS_IFLUSH_SYNC); - xfs_iunlock(uqp, XFS_ILOCK_EXCL); - if (unlikely(error == EFSCORRUPTED)) { - XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)", - XFS_ERRLEVEL_LOW, mp); - goto out; - } + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; } - if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { - xfs_ilock(gqp, XFS_ILOCK_EXCL); - xfs_iflock(gqp); - error = xfs_iflush(gqp, XFS_IFLUSH_SYNC); - xfs_iunlock(gqp, XFS_ILOCK_EXCL); - if (unlikely(error == EFSCORRUPTED)) { - XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)", - XFS_ERRLEVEL_LOW, mp); - goto out; - } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; } } - if (uqp) { - IRELE(uqp); - mp->m_quotainfo->qi_uquotaip = NULL; - } - if (gqp) { - IRELE(gqp); - mp->m_quotainfo->qi_gquotaip = NULL; - } -out: - return XFS_ERROR(error); } /* @@ -987,14 +958,10 @@ xfs_qm_dqdetach( } /* - * This is called by VFS_SYNC and flags arg determines the caller, - * and its motives, as done in xfs_sync. - * - * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31 - * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25 - * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA + * This is called to sync quotas. We can be told to use non-blocking + * semantics by either the SYNC_BDFLUSH flag or the absence of the + * SYNC_WAIT flag. */ - int xfs_qm_sync( xfs_mount_t *mp, @@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo( return error; } - spin_lock_init(&qinf->qi_pinlock); xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); qinf->qi_dqreclaims = 0; @@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo( */ xfs_qm_rele_quotafs_ref(mp); - spinlock_destroy(&qi->qi_pinlock); xfs_qm_list_destroy(&qi->qi_dqlist); if (qi->qi_uquotaip) { diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 44f25349e478..ddf09166387c 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -106,7 +106,6 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - spinlock_t qi_pinlock; /* dquot pinning lock */ xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ @@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); extern void xfs_qm_mount_quotas(xfs_mount_t *); extern int xfs_qm_quotacheck(xfs_mount_t *); extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); -extern int xfs_qm_unmount_quotas(xfs_mount_t *); +extern void xfs_qm_unmount_quotas(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); extern int xfs_qm_sync(xfs_mount_t *, int); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index eea2e60b456b..bc6c5cca3e12 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -20,7 +20,6 @@ #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_clnt.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" @@ -51,7 +50,7 @@ STATIC void xfs_fill_statvfs_from_dquot( - bhv_statvfs_t *statp, + struct kstatfs *statp, xfs_disk_dquot_t *dp) { __uint64_t limit; @@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot( STATIC void xfs_qm_statvfs( xfs_inode_t *ip, - bhv_statvfs_t *statp) + struct kstatfs *statp) { xfs_mount_t *mp = ip->i_mount; xfs_dquot_t *dqp; diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 1a3b803dfa55..68139b38aede 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -127,7 +127,7 @@ xfs_qm_quotactl( break; case Q_XQUOTASYNC: - return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); + return xfs_sync_inodes(mp, SYNC_DELWRI); default: break; @@ -1022,101 +1022,104 @@ xfs_qm_export_flags( /* - * Go thru all the inodes in the file system, releasing their dquots. - * Note that the mount structure gets modified to indicate that quotas are off - * AFTER this, in the case of quotaoff. This also gets called from - * xfs_rootumount. + * Release all the dquots on the inodes in an AG. */ -void -xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) +STATIC void +xfs_qm_dqrele_inodes_ag( + xfs_mount_t *mp, + int ag, + uint flags) { - xfs_inode_t *ip, *topino; - uint ireclaims; - struct inode *vp; - boolean_t vnode_refd; + xfs_inode_t *ip = NULL; + xfs_perag_t *pag = &mp->m_perag[ag]; + int first_index = 0; + int nr_found; - ASSERT(mp->m_quotainfo); - - XFS_MOUNT_ILOCK(mp); -again: - ip = mp->m_inodes; - if (ip == NULL) { - XFS_MOUNT_IUNLOCK(mp); - return; - } do { - /* Skip markers inserted by xfs_sync */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; + /* + * use a gang lookup to find the next inode in the tree + * as the tree is sparse and a gang lookup walks to find + * the number of objects requested. + */ + read_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void**)&ip, first_index, 1); + + if (!nr_found) { + read_unlock(&pag->pag_ici_lock); + break; } - /* Root inode, rbmip and rsumip have associated blocks */ + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) { + read_unlock(&pag->pag_ici_lock); + break; + } + + /* skip quota inodes */ if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); - ip = ip->i_mnext; + read_unlock(&pag->pag_ici_lock); continue; } - vp = VFS_I(ip); - if (!vp) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ip = ip->i_mnext; + + /* + * If we can't get a reference on the inode, it must be + * in reclaim. Leave it for the reclaim code to flush. + */ + if (!igrab(VFS_I(ip))) { + read_unlock(&pag->pag_ici_lock); continue; } - vnode_refd = B_FALSE; - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { - ireclaims = mp->m_ireclaims; - topino = mp->m_inodes; - vp = vn_grab(vp); - if (!vp) - goto again; - - XFS_MOUNT_IUNLOCK(mp); - /* XXX restart limit ? */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - vnode_refd = B_TRUE; - } else { - ireclaims = mp->m_ireclaims; - topino = mp->m_inodes; - XFS_MOUNT_IUNLOCK(mp); + read_unlock(&pag->pag_ici_lock); + + /* avoid new inodes though we shouldn't find any here */ + if (xfs_iflags_test(ip, XFS_INEW)) { + IRELE(ip); + continue; } - /* - * We don't keep the mountlock across the dqrele() call, - * since it can take a while.. - */ + xfs_ilock(ip, XFS_ILOCK_EXCL); if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && + ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * Wait until we've dropped the ilock and mountlock to - * do the vn_rele. Or be condemned to an eternity in the - * inactive code in hell. - */ - if (vnode_refd) - IRELE(ip); - XFS_MOUNT_ILOCK(mp); - /* - * If an inode was inserted or removed, we gotta - * start over again. - */ - if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { - /* XXX use a sentinel */ - goto again; - } - ip = ip->i_mnext; - } while (ip != mp->m_inodes); + xfs_iput(ip, XFS_ILOCK_EXCL); + + } while (nr_found); +} + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. This also gets called from + * xfs_rootumount. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + int i; - XFS_MOUNT_IUNLOCK(mp); + ASSERT(mp->m_quotainfo); + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + if (!mp->m_perag[i].pag_ici_init) + continue; + xfs_qm_dqrele_inodes_ag(mp, i, flags); + } } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index c27abef7b84f..ae5482965424 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -18,6 +18,13 @@ #include <xfs.h> #include "debug.h" +/* xfs_mount.h drags a lot of crap in, sorry.. */ +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static char message[1024]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); @@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...) } void -icmn_err(register int level, char *fmt, va_list ap) +xfs_fs_vcmn_err( + int level, + struct xfs_mount *mp, + char *fmt, + va_list ap) { - ulong flags; - int len; + unsigned long flags; + int len = 0; level &= XFS_ERR_MASK; - if(level > XFS_MAX_ERR_LEVEL) + if (level > XFS_MAX_ERR_LEVEL) level = XFS_MAX_ERR_LEVEL; + spin_lock_irqsave(&xfs_err_lock,flags); - len = vsnprintf(message, sizeof(message), fmt, ap); + + if (mp) { + len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname); + + /* + * Skip the printk if we can't print anything useful + * due to an over-long device name. + */ + if (len >= sizeof(message)) + goto out; + } + + len = vsnprintf(message + len, sizeof(message) - len, fmt, ap); if (len >= sizeof(message)) len = sizeof(message) - 1; if (message[len-1] == '\n') message[len-1] = 0; + printk("%s%s\n", err_level[level], message); + out: spin_unlock_irqrestore(&xfs_err_lock,flags); + BUG_ON(level == CE_PANIC); } @@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line) void xfs_hex_dump(void *p, int length) { - print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); } diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 75845f950814..6f4fd37c67af 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -27,8 +27,6 @@ #define CE_ALERT 1 /* alert */ #define CE_PANIC 0 /* panic */ -extern void icmn_err(int, char *, va_list) - __attribute__ ((format (printf, 2, 0))); extern void cmn_err(int, char *, ...) __attribute__ ((format (printf, 2, 3))); extern void assfail(char *expr, char *f, int l); diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index a34ef05489b1..2d494c26717f 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c @@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) void ktrace_free(ktrace_t *ktp) { - int entries_size; - if (ktp == (ktrace_t *)NULL) return; /* * Special treatment for the Vnode trace buffer. */ - if (ktp->kt_nentries == ktrace_zentries) { + if (ktp->kt_nentries == ktrace_zentries) kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); - } else { - entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); - + else kmem_free(ktp->kt_entries); - } kmem_zone_free(ktrace_hdr_zone, ktp); } diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 540e4c989825..17254b529c54 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -30,7 +30,7 @@ #define XFS_ATTR_TRACE 1 #define XFS_BLI_TRACE 1 #define XFS_BMAP_TRACE 1 -#define XFS_BMBT_TRACE 1 +#define XFS_BTREE_TRACE 1 #define XFS_DIR2_TRACE 1 #define XFS_DQUOT_TRACE 1 #define XFS_ILOCK_TRACE 1 diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 91d69338d3b2..a8cdd73999a4 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -758,7 +758,7 @@ xfs_acl_setmode( if (gap && nomask) iattr.ia_mode |= gap->ae_perm << 3; - return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); + return xfs_setattr(XFS_I(vp), &iattr, 0); } /* diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 61b292a9fb41..f2e21817a226 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -91,6 +91,8 @@ typedef struct xfs_agf { #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); /* * Size of the unlinked inode hash table in the agi. @@ -142,6 +144,9 @@ typedef struct xfs_agi { #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) #define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + /* * The third a.g. block contains the a.g. freelist, an array * of block pointers to blocks owned by the allocation btree code. @@ -192,17 +197,23 @@ typedef struct xfs_perag xfs_agino_t pagi_freecount; /* number of free inodes */ xfs_agino_t pagi_count; /* number of allocated inodes */ int pagb_count; /* pagb slots in use */ + xfs_perag_busy_t *pagb_list; /* unstable blocks */ #ifdef __KERNEL__ spinlock_t pagb_lock; /* lock for pagb_list */ -#endif - xfs_perag_busy_t *pagb_list; /* unstable blocks */ + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ int pag_ici_init; /* incore inode cache initialised */ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ +#endif } xfs_perag_t; +/* + * tags for inode radix tree + */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ + #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 1956f83489f1..028e44e58ea9 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, */ /* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +STATIC int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + } + return error; +} + +/* * Compute aligned version of the found extent. * Takes alignment and min length into account. */ @@ -294,21 +380,20 @@ xfs_alloc_fixup_trees( return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } + #ifdef DEBUG - { - xfs_alloc_block_t *bnoblock; - xfs_alloc_block_t *cntblock; - - if (bno_cur->bc_nlevels == 1 && - cnt_cur->bc_nlevels == 1) { - bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); - cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( - be16_to_cpu(bnoblock->bb_numrecs) == - be16_to_cpu(cntblock->bb_numrecs)); - } + if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { + struct xfs_btree_block *bnoblock; + struct xfs_btree_block *cntblock; + + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + + XFS_WANT_CORRUPTED_RETURN( + bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif + /* * Deal with all four cases: the allocated record is contained * within the freespace record, so we can have new freespace @@ -333,7 +418,7 @@ xfs_alloc_fixup_trees( /* * Delete the entry from the by-size btree. */ - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); /* @@ -343,7 +428,7 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } @@ -351,7 +436,7 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } @@ -362,7 +447,7 @@ xfs_alloc_fixup_trees( /* * No remaining freespace, just delete the by-block tree entry. */ - if ((error = xfs_alloc_delete(bno_cur, &i))) + if ((error = xfs_btree_delete(bno_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } else { @@ -379,7 +464,7 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(bno_cur, &i))) + if ((error = xfs_btree_insert(bno_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } @@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact( /* * Allocate/initialize a cursor for the by-number freespace btree. */ - bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno @@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact( * We are allocating agbno for rlen [agbno .. end] * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, @@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near( /* * Get a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); ltlen = 0; bno_cur_lt = bno_cur_gt = NULL; /* @@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near( XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (ltlen >= args->minlen) break; - if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) goto error0; } while (i); ASSERT(ltlen >= args->minlen); @@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near( i = cnt_cur->bc_ptrs[0]; for (j = 1, blen = 0, bdiff = 0; !error && j && (blen < args->maxlen || bdiff > 0); - error = xfs_alloc_increment(cnt_cur, 0, &j)) { + error = xfs_btree_increment(cnt_cur, 0, &j)) { /* * For each entry, decide if it's better than * the previous best entry. @@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near( /* * Set up a cursor for the by-bno tree. */ - bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); /* * Fix up the btree entries. */ @@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near( /* * Allocate and initialize the cursor for the leftward search. */ - bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); /* * Lookup <= bno to find the leftward search's starting point. */ @@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near( * Increment the cursor, so we will point at the entry just right * of the leftward entry if any, or to the leftmost entry. */ - if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; if (!i) { /* @@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near( args->minlen, <bnoa, <lena); if (ltlena >= args->minlen) break; - if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; if (!i) { xfs_btree_del_cursor(bno_cur_lt, @@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near( args->minlen, >bnoa, >lena); if (gtlena >= args->minlen) break; - if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; if (!i) { xfs_btree_del_cursor(bno_cur_gt, @@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near( /* * Fell off the right end. */ - if ((error = xfs_alloc_increment( + if ((error = xfs_btree_increment( bno_cur_gt, 0, &i))) goto error0; if (!i) { @@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near( /* * Fell off the left end. */ - if ((error = xfs_alloc_decrement( + if ((error = xfs_btree_decrement( bno_cur_lt, 0, &i))) goto error0; if (!i) { @@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size( /* * Allocate and initialize a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); bno_cur = NULL; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size( bestflen = flen; bestfbno = fbno; for (;;) { - if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) goto error0; if (i == 0) break; @@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size( /* * Allocate and initialize a cursor for the by-block tree. */ - bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t flen; int i; - if ((error = xfs_alloc_decrement(ccur, 0, &i))) + if ((error = xfs_btree_decrement(ccur, 0, &i))) goto error0; if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) @@ -1515,8 +1600,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, - 0); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); cnt_cur = NULL; /* * Look for a neighboring block on the left (lower block numbers) @@ -1549,7 +1633,7 @@ xfs_free_ag_extent( * Look for a neighboring block on the right (higher block numbers) * that is contiguous with this space. */ - if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) goto error0; if (haveright) { /* @@ -1575,8 +1659,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, - 0); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. @@ -1588,7 +1671,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* @@ -1597,19 +1680,19 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Delete the old by-block entry for the right block. */ - if ((error = xfs_alloc_delete(bno_cur, &i))) + if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ - if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); #ifdef DEBUG @@ -1648,14 +1731,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ - if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); nbno = ltbno; @@ -1674,7 +1757,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* @@ -1693,7 +1776,7 @@ xfs_free_ag_extent( else { nbno = bno; nlen = len; - if ((error = xfs_alloc_insert(bno_cur, &i))) + if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); } @@ -1705,7 +1788,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 0, error0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); @@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist( * Read in the allocation group header (free/alloc section). */ int /* error */ -xfs_alloc_read_agf( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - xfs_buf_t **bpp) /* buffer for the ag freelist header */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - xfs_agf_t *agf; /* ag freelist header */ + struct xfs_agf *agf; /* ag freelist header */ int agf_ok; /* set if agf is consistent */ - xfs_buf_t *bp; /* return value */ - xfs_perag_t *pag; /* per allocation group data */ int error; ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), - (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U, - &bp); + XFS_FSS_TO_BB(mp, 1), flags, bpp); if (error) return error; - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (!bp) { - *bpp = NULL; + if (!*bpp) return 0; - } + + ASSERT(!XFS_BUF_GETERROR(*bpp)); + agf = XFS_BUF_TO_AGF(*bpp); + /* * Validate the magic number of the agf block. */ - agf = XFS_BUF_TO_AGF(bp); agf_ok = be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_seqno) == agno; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= + be32_to_cpu(agf->agf_length); if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, XFS_RANDOM_ALLOC_READ_AGF))) { XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, bp); + xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } + + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + ASSERT(agno != NULLAGNUMBER); + + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + agf = XFS_BUF_TO_AGF(*bpp); pag = &mp->m_perag[agno]; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); @@ -2213,6 +2328,7 @@ xfs_alloc_read_agf( #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == @@ -2221,8 +2337,6 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF); - *bpp = bp; return 0; } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 5aec15d0651e..588172796f7b 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf; #define XFS_ALLOC_KTRACE_BUSYSEARCH 6 #endif +void +xfs_alloc_mark_busy(xfs_trans_t *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len); + +void +xfs_alloc_clear_busy(xfs_trans_t *tp, + xfs_agnumber_t ag, + int idx); + +#endif /* __KERNEL__ */ + /* * Compute and fill in value of m_ag_maxlevels. */ @@ -196,18 +209,4 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); - -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); - - -#endif /* __KERNEL__ */ - #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 3ce2645508ae..733cb75a8c5d 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -35,2177 +35,464 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" -/* - * Prototypes for internal functions. - */ -STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); -STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); -STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, - xfs_alloc_key_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int); +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} -/* - * Internal functions. - */ +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; -/* - * Single level of the xfs_alloc_delete record deletion routine. - * Delete record pointed to by cur/level. - * Remove the record from its block then rebalance the tree. - * Return 0 for error, 1 for done, 2 to go on to the next level. - */ -STATIC int /* error */ -xfs_alloc_delrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level removing record from */ - int *stat) /* fail/done/go-on */ + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc; + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} + +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) { - xfs_agf_t *agf; /* allocation group freelist header */ - xfs_alloc_block_t *block; /* btree block record/key lives in */ - xfs_agblock_t bno; /* btree block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* kp points here if block is level 0 */ - xfs_agblock_t lbno; /* left block's block number */ - xfs_buf_t *lbp; /* left block's buffer pointer */ - xfs_alloc_block_t *left; /* left btree block */ - xfs_alloc_key_t *lkp=NULL; /* left block key pointer */ - xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */ - int lrecs=0; /* number of records in left block */ - xfs_alloc_rec_t *lrp; /* left block record pointer */ - xfs_mount_t *mp; /* mount structure */ - int ptr; /* index in btree block for this rec */ - xfs_agblock_t rbno; /* right block's block number */ - xfs_buf_t *rbp; /* right block's buffer pointer */ - xfs_alloc_block_t *right; /* right btree block */ - xfs_alloc_key_t *rkp; /* right block key pointer */ - xfs_alloc_ptr_t *rpp; /* right block address pointer */ - int rrecs=0; /* number of records in right block */ - int numrecs; - xfs_alloc_rec_t *rrp; /* right block record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ + int error; + xfs_agblock_t bno; - /* - * Get the index of the entry being deleted, check for nothing there. - */ - ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - *stat = 0; - return 0; - } - /* - * Get the buffer & block containing the record or key/ptr. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. */ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; -#endif - /* - * Fail if we're off the end of the block. - */ - numrecs = be16_to_cpu(block->bb_numrecs); - if (ptr > numrecs) { + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } - XFS_STATS_INC(xs_abt_delrec); - /* - * It's a nonleaf. Excise the key and ptr being deleted, by - * sliding the entries past them down one. - * Log the changed areas of the block. - */ - if (level > 0) { - lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - if (ptr < numrecs) { - memmove(&lkp[ptr - 1], &lkp[ptr], - (numrecs - ptr) * sizeof(*lkp)); - memmove(&lpp[ptr - 1], &lpp[ptr], - (numrecs - ptr) * sizeof(*lpp)); - xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1); - xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1); - } - } - /* - * It's a leaf. Excise the record being deleted, by sliding the - * entries past it down one. Log the changed areas of the block. - */ - else { - lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); - if (ptr < numrecs) { - memmove(&lrp[ptr - 1], &lrp[ptr], - (numrecs - ptr) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1); - } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - key.ar_startblock = lrp->ar_startblock; - key.ar_blockcount = lrp->ar_blockcount; - lkp = &key; - } - } - /* - * Decrement and log the number of entries in the block. - */ - numrecs--; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * See if the longest free extent in the allocation group was - * changed by this operation. True if it's the by-size btree, and - * this is the leaf level, and there is no right sibling block, - * and this was the last record. - */ - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - mp = cur->bc_mp; - if (level == 0 && - cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr > numrecs) { - ASSERT(ptr == numrecs + 1); - /* - * There are still records in the block. Grab the size - * from the last one. - */ - if (numrecs) { - rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur); - agf->agf_longest = rrp->ar_blockcount; - } - /* - * No free extents left. - */ - else - agf->agf_longest = 0; - mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest = - be32_to_cpu(agf->agf_longest); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); - } - /* - * Is this the root level? If so, we're almost done. - */ - if (level == cur->bc_nlevels - 1) { - /* - * If this is the root level, - * and there's only one entry left, - * and it's NOT the leaf level, - * then we can get rid of this level. - */ - if (numrecs == 1 && level > 0) { - /* - * lpp is still set to the first pointer in the block. - * Make it the new root of the btree. - */ - bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]); - agf->agf_roots[cur->bc_btnum] = *lpp; - be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1); - mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--; - /* - * Put this buffer/block on the ag's freelist. - */ - error = xfs_alloc_put_freelist(cur->bc_tp, - cur->bc_private.a.agbp, NULL, bno, 1); - if (error) - return error; - /* - * Since blocks move to the free list without the - * coordination used in xfs_bmap_finish, we can't allow - * block to be available for reallocation and - * non-transaction writing (user data) until we know - * that the transaction that moved it to the free list - * is permanently on disk. We track the blocks by - * declaring these blocks as "busy"; the busy list is - * maintained on a per-ag basis and each transaction - * records which entries should be removed when the - * iclog commits to disk. If a busy block is - * allocated, the iclog is pushed up to the LSN - * that freed the block. - */ - xfs_alloc_mark_busy(cur->bc_tp, - be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); - xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_ROOTS | XFS_AGF_LEVELS); - /* - * Update the cursor so there's one fewer level. - */ - xfs_btree_setbuf(cur, level, NULL); - cur->bc_nlevels--; - } else if (level > 0 && - (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * If we deleted the leftmost entry in the block, update the - * key values above us in the tree. - */ - if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1))) - return error; - /* - * If the number of records remaining in the block is at least - * the minimum, we're done. - */ - if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * Otherwise, we have to move some records around to keep the - * tree balanced. Look at the left and right sibling blocks to - * see if we can re-balance by moving only one record. - */ - rbno = be32_to_cpu(block->bb_rightsib); - lbno = be32_to_cpu(block->bb_leftsib); - bno = NULLAGBLOCK; - ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); - /* - * Duplicate the cursor so our btree manipulations here won't - * disrupt the next level up. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - /* - * If there's a right sibling, see if it's ok to shift an entry - * out of it. - */ - if (rbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the last entry in the next block. - * Actually any entry but the first would suffice. - */ - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_increment(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * Grab a pointer to the block. - */ - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(right->bb_leftsib); - /* - * If right block is full enough so that removing one entry - * won't make it too empty, and left-shifting an entry out - * of right to us works, we're done. - */ - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_alloc_lshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level > 0 && - (error = xfs_alloc_decrement(cur, level, - &i))) - return error; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference, and fix up the temp cursor to point - * to our block again (last record). - */ - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLAGBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_decrement(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } - } - /* - * If there's a left sibling, see if it's ok to shift an entry - * out of it. - */ - if (lbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the first entry in the - * previous block. - */ - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_decrement(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_btree_firstrec(tcur, level); - /* - * Grab a pointer to the block. - */ - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(left->bb_rightsib); - /* - * If left block is full enough so that removing one entry - * won't make it too empty, and right-shifting an entry out - * of left to us works, we're done. - */ - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_alloc_rshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level == 0) - cur->bc_ptrs[0]++; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference. - */ - lrecs = be16_to_cpu(left->bb_numrecs); - } - /* - * Delete the temp cursor, we're done with it. - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - /* - * If here, we need to do a join to keep the tree balanced. - */ - ASSERT(bno != NULLAGBLOCK); - /* - * See if we can join with the left neighbor block. - */ - if (lbno != NULLAGBLOCK && - lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * Set "right" to be the starting block, - * "left" to be the left neighbor. - */ - rbno = bno; - right = block; - rrecs = be16_to_cpu(right->bb_numrecs); - rbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, lbno, 0, &lbp, - XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - lrecs = be16_to_cpu(left->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - } - /* - * If that won't work, see if we can join with the right neighbor block. - */ - else if (rbno != NULLAGBLOCK && - rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * Set "left" to be the starting block, - * "right" to be the right neighbor. - */ - lbno = bno; - left = block; - lrecs = be16_to_cpu(left->bb_numrecs); - lbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, rbno, 0, &rbp, - XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - rrecs = be16_to_cpu(right->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - } - /* - * Otherwise, we can't fix the imbalance. - * Just return. This is probably a logic error, but it's not fatal. - */ - else { - if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * We're now going to join "left" and "right" by moving all the stuff - * in "right" to "left" and deleting "right". - */ - if (level > 0) { - /* - * It's a non-leaf. Move keys and pointers. - */ - lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memcpy(lkp, rkp, rrecs * sizeof(*lkp)); - memcpy(lpp, rpp, rrecs * sizeof(*lpp)); - xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); - xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); - } else { - /* - * It's a leaf. Move records. - */ - lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, rrecs * sizeof(*lrp)); - xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); - } - /* - * If we joined with the left neighbor, set the buffer in the - * cursor to the left block, and fix up the index. - */ - if (bp != lbp) { - xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += lrecs; - } - /* - * If we joined with the right neighbor and there's a level above - * us, increment the cursor at that level. - */ - else if (level + 1 < cur->bc_nlevels && - (error = xfs_alloc_increment(cur, level + 1, &i))) - return error; - /* - * Fix up the number of records in the surviving block. - */ - lrecs += rrecs; - left->bb_numrecs = cpu_to_be16(lrecs); - /* - * Fix up the right block pointer in the surviving block, and log it. - */ - left->bb_rightsib = right->bb_rightsib; - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there is a right sibling now, make it point to the - * remaining block. - */ - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - xfs_alloc_block_t *rrblock; - xfs_buf_t *rrbp; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, - &rrbp, XFS_ALLOC_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(lbno); - xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * Free the deleting block by putting it on the freelist. - */ - error = xfs_alloc_put_freelist(cur->bc_tp, - cur->bc_private.a.agbp, NULL, rbno, 1); +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; + /* - * Since blocks move to the free list without the coordination - * used in xfs_bmap_finish, we can't allow block to be available - * for reallocation and non-transaction writing (user data) - * until we know that the transaction that moved it to the free - * list is permanently on disk. We track the blocks by declaring - * these blocks as "busy"; the busy list is maintained on a - * per-ag basis and each transaction records which entries - * should be removed when the iclog commits to disk. If a - * busy block is allocated, the iclog is pushed up to the + * Since blocks move to the free list without the coordination used in + * xfs_bmap_finish, we can't allow block to be available for + * reallocation and non-transaction writing (user data) until we know + * that the transaction that moved it to the free list is permanently + * on disk. We track the blocks by declaring these blocks as "busy"; + * the busy list is maintained on a per-ag basis and each transaction + * records which entries should be removed when the iclog commits to + * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); - - /* - * Adjust the current level's cursor so that we're left referring - * to the right node, after we're done. - * If this leaves the ptr value 0 our caller will fix it up. - */ - if (level > 0) - cur->bc_ptrs[level]--; - /* - * Return value means the next level up has something to do. - */ - *stat = 2; return 0; - -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; } /* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. + * Update the longest extent in the AGF */ -STATIC int /* error */ -xfs_alloc_insrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to insert record at */ - xfs_agblock_t *bnop, /* i/o: block number inserted */ - xfs_alloc_rec_t *recp, /* i/o: record data inserted */ - xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ - int *stat) /* output: success/failure */ +STATIC void +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) { - xfs_agf_t *agf; /* allocation group freelist header */ - xfs_alloc_block_t *block; /* btree block record/key lives in */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* key value being inserted */ - xfs_alloc_key_t *kp; /* pointer to btree keys */ - xfs_agblock_t nbno; /* block number of allocated block */ - xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ - xfs_alloc_key_t nkey; /* new key value, from split */ - xfs_alloc_rec_t nrec; /* new record value, for caller */ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + __be32 len; int numrecs; - int optr; /* old ptr value */ - xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_alloc_rec_t *rp; /* pointer to btree records */ - ASSERT(be32_to_cpu(recp->ar_blockcount) > 0); + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. + */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); - /* - * GCC doesn't understand the (arguably complex) control flow in - * this function and complains about uninitialized structure fields - * without this. - */ - memset(&nrec, 0, sizeof(nrec)); + if (numrecs) { + xfs_alloc_rec_t *rrp; - /* - * If we made it to the root level, allocate a new root block - * and we're done. - */ - if (level >= cur->bc_nlevels) { - XFS_STATS_INC(xs_abt_insrec); - if ((error = xfs_alloc_newroot(cur, &i))) - return error; - *bnop = NULLAGBLOCK; - *stat = i; - return 0; - } - /* - * Make a key out of the record data to be inserted, and save it. - */ - key.ar_startblock = recp->ar_startblock; - key.ar_blockcount = recp->ar_blockcount; - optr = ptr = cur->bc_ptrs[level]; - /* - * If we're off the left edge, return failure. - */ - if (ptr == 0) { - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_abt_insrec); - /* - * Get pointers to the btree buffer and block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; - /* - * Check that the new entry is being inserted in the right place. - */ - if (ptr <= numrecs) { - if (level == 0) { - rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); - xfs_btree_check_rec(cur->bc_btnum, recp, rp); + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; } else { - kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); - xfs_btree_check_key(cur->bc_btnum, &key, kp); - } - } -#endif - nbno = NULLAGBLOCK; - ncur = NULL; - /* - * If the block is full, we can't insert the new entry until we - * make the block un-full. - */ - if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * First, try shifting an entry to the right neighbor. - */ - if ((error = xfs_alloc_rshift(cur, level, &i))) - return error; - if (i) { - /* nothing */ - } - /* - * Next, try shifting an entry to the left neighbor. - */ - else { - if ((error = xfs_alloc_lshift(cur, level, &i))) - return error; - if (i) - optr = ptr = cur->bc_ptrs[level]; - else { - /* - * Next, try splitting the current block in - * half. If this works we have to re-set our - * variables because we could be in a - * different block now. - */ - if ((error = xfs_alloc_split(cur, level, &nbno, - &nkey, &ncur, &i))) - return error; - if (i) { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = - xfs_btree_check_sblock(cur, - block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - nrec.ar_startblock = nkey.ar_startblock; - nrec.ar_blockcount = nkey.ar_blockcount; - } - /* - * Otherwise the insert fails. - */ - else { - *stat = 0; - return 0; - } - } - } - } - /* - * At this point we know there's room for our new entry in the block - * we're pointing at. - */ - numrecs = be16_to_cpu(block->bb_numrecs); - if (level > 0) { - /* - * It's a non-leaf entry. Make a hole for the new data - * in the key and ptr regions of the block. - */ - kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) - return error; + len = 0; } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, *bnop, level))) - return error; -#endif - /* - * Now stuff the new data in, bump numrecs and log the new data. - */ - kp[ptr - 1] = key; - pp[ptr - 1] = cpu_to_be32(*bnop); - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_alloc_log_keys(cur, bp, ptr, numrecs); - xfs_alloc_log_ptrs(cur, bp, ptr, numrecs); -#ifdef DEBUG - if (ptr < numrecs) - xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, - kp + ptr); -#endif - } else { - /* - * It's a leaf entry. Make a hole for the new record. - */ - rp = XFS_ALLOC_REC_ADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*rp)); - /* - * Now stuff the new record in, bump numrecs - * and log the new data. - */ - rp[ptr - 1] = *recp; - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_alloc_log_recs(cur, bp, ptr, numrecs); -#ifdef DEBUG - if (ptr < numrecs) - xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, - rp + ptr); -#endif - } - /* - * Log the new number of records in the btree header. - */ - xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * If we inserted at the start of a block, update the parents' keys. - */ - if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1))) - return error; - /* - * Look to see if the longest extent in the allocation group - * needs to be updated. - */ - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - if (level == 0 && - cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) { - /* - * If this is a leaf in the by-size btree and there - * is no right sibling block and this block is bigger - * than the previous longest block, update it. - */ - agf->agf_longest = recp->ar_blockcount; - cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest - = be32_to_cpu(recp->ar_blockcount); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); + break; + default: + ASSERT(0); + return; } - /* - * Return the new block number, if any. - * If there is one, give back a record value and a cursor too. - */ - *bnop = nbno; - if (nbno != NULLAGBLOCK) { - *recp = nrec; - *curp = ncur; - } - *stat = 1; - return 0; + + agf->agf_longest = len; + cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); } -/* - * Log header fields from a btree block. - */ -STATIC void -xfs_alloc_log_block( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) { - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - static const short offsets[] = { /* table of offsets */ - offsetof(xfs_alloc_block_t, bb_magic), - offsetof(xfs_alloc_block_t, bb_level), - offsetof(xfs_alloc_block_t, bb_numrecs), - offsetof(xfs_alloc_block_t, bb_leftsib), - offsetof(xfs_alloc_block_t, bb_rightsib), - sizeof(xfs_alloc_block_t) - }; + return cur->bc_mp->m_alloc_mnr[level != 0]; +} - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); - xfs_trans_log_buf(tp, bp, first, last); +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mxr[level != 0]; } -/* - * Log keys from a btree block (nonleaf). - */ STATIC void -xfs_alloc_log_keys( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int kfirst, /* index of first key to log */ - int klast) /* index of last key to log */ +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - xfs_alloc_key_t *kp; /* key pointer in btree block */ - int last; /* last byte offset logged */ + ASSERT(rec->alloc.ar_startblock != 0); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; } -/* - * Log block pointer fields from a btree block (nonleaf). - */ STATIC void -xfs_alloc_log_ptrs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int pfirst, /* index of first pointer to log */ - int plast) /* index of last pointer to log */ +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */ + ASSERT(key->alloc.ar_startblock != 0); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; } -/* - * Log records from a btree block (leaf). - */ STATIC void -xfs_alloc_log_recs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int rfirst, /* index of first record to log */ - int rlast) /* index of last record to log */ +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_alloc_rec_t *rp; /* record pointer for btree block */ - + ASSERT(cur->bc_rec.a.ar_startblock != 0); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - rp = XFS_ALLOC_REC_ADDR(block, 1, cur); -#ifdef DEBUG - { - xfs_agf_t *agf; - xfs_alloc_rec_t *p; - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++) - ASSERT(be32_to_cpu(p->ar_startblock) + - be32_to_cpu(p->ar_blockcount) <= - be32_to_cpu(agf->agf_length)); - } -#endif - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); } -/* - * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. - */ -STATIC int /* error */ -xfs_alloc_lookup( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_lookup_t dir, /* <=, ==, or >= */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) { - xfs_agblock_t agbno; /* a.g. relative btree block number */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_alloc_block_t *block=NULL; /* current btree block */ - int diff; /* difference for the current key */ - int error; /* error return value */ - int keyno=0; /* current key number */ - int level; /* level in the btree */ - xfs_mount_t *mp; /* file system mount point */ - - XFS_STATS_INC(xs_abt_lookup); - /* - * Get the allocation group header, and the root block number. - */ - mp = cur->bc_mp; - - { - xfs_agf_t *agf; /* a.g. freespace header */ - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - agno = be32_to_cpu(agf->agf_seqno); - agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]); - } - /* - * Iterate over each level in the btree, starting at the root. - * For each level above the leaves, find the key we need, based - * on the lookup record, then follow the corresponding block - * pointer down to the next level. - */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - xfs_buf_t *bp; /* buffer pointer for btree block */ - xfs_daddr_t d; /* disk address of btree block */ - - /* - * Get the disk address we're looking for. - */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - /* - * If the old buffer at this level is for a different block, - * throw it away, otherwise just use it. - */ - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = NULL; - if (!bp) { - /* - * Need to get a new buffer. Read it, then - * set it in the cursor, releasing the old one. - */ - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno, - agbno, 0, &bp, XFS_ALLOC_BTREE_REF))) - return error; - xfs_btree_setbuf(cur, level, bp); - /* - * Point to the btree block, now that we have the buffer - */ - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, level, - bp))) - return error; - } else - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - /* - * If we already had a key match at a higher level, we know - * we need to use the first entry in this block. - */ - if (diff == 0) - keyno = 1; - /* - * Otherwise we need to search this block. Do a binary search. - */ - else { - int high; /* high entry number */ - xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */ - xfs_alloc_rec_t *krbase=NULL;/* base of records in block */ - int low; /* low entry number */ - - /* - * Get a pointer to keys or records. - */ - if (level > 0) - kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur); - else - krbase = XFS_ALLOC_REC_ADDR(block, 1, cur); - /* - * Set low and high entry numbers, 1-based. - */ - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - /* - * If the block is empty, the tree must - * be an empty leaf. - */ - ASSERT(level == 0 && cur->bc_nlevels == 1); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - *stat = 0; - return 0; - } - /* - * Binary search the block. - */ - while (low <= high) { - xfs_extlen_t blockcount; /* key value */ - xfs_agblock_t startblock; /* key value */ - - XFS_STATS_INC(xs_abt_compare); - /* - * keyno is average of low and high. - */ - keyno = (low + high) >> 1; - /* - * Get startblock & blockcount. - */ - if (level > 0) { - xfs_alloc_key_t *kkp; - - kkp = kkbase + keyno - 1; - startblock = be32_to_cpu(kkp->ar_startblock); - blockcount = be32_to_cpu(kkp->ar_blockcount); - } else { - xfs_alloc_rec_t *krp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - krp = krbase + keyno - 1; - startblock = be32_to_cpu(krp->ar_startblock); - blockcount = be32_to_cpu(krp->ar_blockcount); - } - /* - * Compute difference to get next direction. - */ - if (cur->bc_btnum == XFS_BTNUM_BNO) - diff = (int)startblock - - (int)cur->bc_rec.a.ar_startblock; - else if (!(diff = (int)blockcount - - (int)cur->bc_rec.a.ar_blockcount)) - diff = (int)startblock - - (int)cur->bc_rec.a.ar_startblock; - /* - * Less than, move right. - */ - if (diff < 0) - low = keyno + 1; - /* - * Greater than, move left. - */ - else if (diff > 0) - high = keyno - 1; - /* - * Equal, we're done. - */ - else - break; - } - } - /* - * If there are more levels, set up for the next level - * by getting the block number and filling in the cursor. - */ - if (level > 0) { - /* - * If we moved left, need the previous key number, - * unless there isn't one. - */ - if (diff > 0 && --keyno < 1) - keyno = 1; - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, agbno, level))) - return error; -#endif - cur->bc_ptrs[level] = keyno; - } - } - /* - * Done with the search. - * See if we need to adjust the results. - */ - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && - keyno > be16_to_cpu(block->bb_numrecs) && - be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - int i; + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); - cur->bc_ptrs[0] = keyno; - if ((error = xfs_alloc_increment(cur, 0, &i))) - return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - /* - * Return if we succeeded or not. - */ - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) - *stat = 0; - else - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - return 0; + ptr->s = agf->agf_roots[cur->bc_btnum]; } -/* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_alloc_lshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) { - int error; /* error return value */ -#ifdef DEBUG - int i; /* loop index */ -#endif - xfs_alloc_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left neighbor block */ - xfs_alloc_block_t *left; /* left neighbor btree block */ - int nrec; /* new number of left block entries */ - xfs_buf_t *rbp; /* buffer for right (current) block */ - xfs_alloc_block_t *right; /* right (current) btree block */ - xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */ - xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */ - xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; - /* - * Set up variables for this block as "right". - */ - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; -#endif - /* - * If we've got no left sibling then we can't shift an entry left. - */ - if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] <= 1) { - *stat = 0; - return 0; - } - /* - * Set up the left neighbor as "left". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), - 0, &lbp, XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; } - nrec = be16_to_cpu(left->bb_numrecs) + 1; - /* - * If non-leaf, copy a key and a ptr to the left block. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* key pointer for left block */ - xfs_alloc_ptr_t *lpp; /* address pointer for left block */ - lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - *lkp = *rkp; - xfs_alloc_log_keys(cur, lbp, nrec, nrec); - lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) - return error; -#endif - *lpp = *rpp; - xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); - xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); - } - /* - * If leaf, copy a record to the left block. - */ - else { - xfs_alloc_rec_t *lrp; /* record pointer for left block */ + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; - lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - *lrp = *rrp; - xfs_alloc_log_recs(cur, lbp, nrec, nrec); - xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); - } - /* - * Bump and log left's numrecs, decrement and log right's numrecs. - */ - be16_add_cpu(&left->bb_numrecs, 1); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add_cpu(&right->bb_numrecs, -1); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Slide the contents of right down one entry. - */ - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]), - level))) - return error; - } -#endif - memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - } else { - memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ar_startblock = rrp->ar_startblock; - key.ar_blockcount = rrp->ar_blockcount; - rkp = &key; - } - /* - * Update the parent key values of right. - */ - if ((error = xfs_alloc_updkey(cur, rkp, level + 1))) - return error; - /* - * Slide the cursor value left one. - */ - cur->bc_ptrs[level]--; - *stat = 1; - return 0; + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -/* - * Allocate a new root block, fill it in. - */ -STATIC int /* error */ -xfs_alloc_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC int +xfs_allocbt_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) { - int error; /* error return value */ - xfs_agblock_t lbno; /* left block number */ - xfs_buf_t *lbp; /* left btree buffer */ - xfs_alloc_block_t *left; /* left btree block */ - xfs_mount_t *mp; /* mount structure */ - xfs_agblock_t nbno; /* new block number */ - xfs_buf_t *nbp; /* new (root) buffer */ - xfs_alloc_block_t *new; /* new (root) btree block */ - int nptr; /* new value for key index, 1 or 2 */ - xfs_agblock_t rbno; /* right block number */ - xfs_buf_t *rbp; /* right btree buffer */ - xfs_alloc_block_t *right; /* right btree block */ - - mp = cur->bc_mp; + int error; - ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); - /* - * Get a buffer from the freelist blocks, for the new root. - */ - error = xfs_alloc_get_freelist(cur->bc_tp, - cur->bc_private.a.agbp, &nbno, 1); - if (error) - return error; - /* - * None available, we fail. - */ - if (nbno == NULLAGBLOCK) { - *stat = 0; - return 0; - } - xfs_trans_agbtree_delta(cur->bc_tp, 1); - nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno, - 0); - new = XFS_BUF_TO_ALLOC_BLOCK(nbp); - /* - * Set the root data in the a.g. freespace structure. - */ - { - xfs_agf_t *agf; /* a.g. freespace header */ - xfs_agnumber_t seqno; + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno); - be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1); - seqno = be32_to_cpu(agf->agf_seqno); - mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++; - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_ROOTS | XFS_AGF_LEVELS); - } /* - * At the previous root level there are now two blocks: the old - * root, and the new block generated when it was split. - * We don't know which one the cursor is pointing at, so we - * set up variables "left" and "right" for each case. + * Update the root pointer, decreasing the level by 1 and then + * free the old root. */ - lbp = cur->bc_bufs[cur->bc_nlevels - 1]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) + xfs_allocbt_set_root(cur, newroot, -1); + error = xfs_allocbt_free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; -#endif - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - /* - * Our block is left, pick up the right block. - */ - lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp)); - rbno = be32_to_cpu(left->bb_rightsib); - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, rbno, 0, &rbp, - XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, - cur->bc_nlevels - 1, rbp))) - return error; - nptr = 1; - } else { - /* - * Our block is right, pick up the left block. - */ - rbp = lbp; - right = left; - rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp)); - lbno = be32_to_cpu(right->bb_leftsib); - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, lbno, 0, &lbp, - XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, - cur->bc_nlevels - 1, lbp))) - return error; - nptr = 2; } - /* - * Fill in the new block's btree header and log it. - */ - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - new->bb_level = cpu_to_be16(cur->bc_nlevels); - new->bb_numrecs = cpu_to_be16(2); - new->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS); - ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); - /* - * Fill in the key data in the new root. - */ - { - xfs_alloc_key_t *kp; /* btree key pointer */ - kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); - if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); - kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur); - } else { - xfs_alloc_rec_t *rp; /* btree record pointer */ + XFS_BTREE_STATS_INC(cur, free); - rp = XFS_ALLOC_REC_ADDR(left, 1, cur); - kp[0].ar_startblock = rp->ar_startblock; - kp[0].ar_blockcount = rp->ar_blockcount; - rp = XFS_ALLOC_REC_ADDR(right, 1, cur); - kp[1].ar_startblock = rp->ar_startblock; - kp[1].ar_blockcount = rp->ar_blockcount; - } - } - xfs_alloc_log_keys(cur, nbp, 1, 2); - /* - * Fill in the pointer data in the new root. - */ - { - xfs_alloc_ptr_t *pp; /* btree address pointer */ + xfs_btree_setbuf(cur, level, NULL); + cur->bc_nlevels--; - pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); - pp[0] = cpu_to_be32(lbno); - pp[1] = cpu_to_be32(rbno); - } - xfs_alloc_log_ptrs(cur, nbp, 1, 2); - /* - * Fix up the cursor. - */ - xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; - cur->bc_nlevels++; - *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_alloc_rshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left (current) block */ - xfs_alloc_block_t *left; /* left (current) btree block */ - xfs_buf_t *rbp; /* buffer for right neighbor block */ - xfs_alloc_block_t *right; /* right neighbor btree block */ - xfs_alloc_key_t *rkp; /* key pointer for right block */ - xfs_btree_cur_t *tcur; /* temporary cursor */ - - /* - * Set up variables for this block as "left". - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * If we've got no right sibling then we can't shift an entry right. - */ - if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - *stat = 0; - return 0; - } - /* - * Set up the right neighbor as "right". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), - 0, &rbp, XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - /* - * Make a hole at the start of the right neighbor block, then - * copy the last left block entry to the hole. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* key pointer for left block */ - xfs_alloc_ptr_t *lpp; /* address pointer for left block */ - xfs_alloc_ptr_t *rpp; /* address pointer for right block */ - - lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); #ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) - return error; -#endif - *rkp = *lkp; - *rpp = *lpp; - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); } else { - xfs_alloc_rec_t *lrp; /* record pointer for left block */ - xfs_alloc_rec_t *rrp; /* record pointer for right block */ - - lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ar_startblock = rrp->ar_startblock; - key.ar_blockcount = rrp->ar_blockcount; - rkp = &key; - xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); } - /* - * Decrement and log left's numrecs, bump and log right's numrecs. - */ - be16_add_cpu(&left->bb_numrecs, -1); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add_cpu(&right->bb_numrecs, 1); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Using a temporary cursor, update the parent key values of the - * block on the right. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_increment(tcur, level, &i)) || - (error = xfs_alloc_updkey(tcur, rkp, level + 1))) - goto error0; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - *stat = 1; - return 0; -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; } -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_alloc_split( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to split */ - xfs_agblock_t *bnop, /* output: block number allocated */ - xfs_alloc_key_t *keyp, /* output: first key of new block */ - xfs_btree_cur_t **curp, /* output: new cursor */ - int *stat) /* success/failure */ +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) { - int error; /* error return value */ - int i; /* loop index/record number */ - xfs_agblock_t lbno; /* left (current) block number */ - xfs_buf_t *lbp; /* buffer for left block */ - xfs_alloc_block_t *left; /* left (current) btree block */ - xfs_agblock_t rbno; /* right (new) block number */ - xfs_buf_t *rbp; /* buffer for right block */ - xfs_alloc_block_t *right; /* right (new) btree block */ - - /* - * Allocate the new block from the freelist. - * If we can't do it, we're toast. Give up. - */ - error = xfs_alloc_get_freelist(cur->bc_tp, - cur->bc_private.a.agbp, &rbno, 1); - if (error) - return error; - if (rbno == NULLAGBLOCK) { - *stat = 0; - return 0; - } - xfs_trans_agbtree_delta(cur->bc_tp, 1); - rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno, - rbno, 0); - /* - * Set up the new block as "right". - */ - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - /* - * "Left" is the current (according to the cursor) block. - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * Fill in the btree header for the new block. - */ - right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - /* - * Make sure that if there's an odd number of entries now, that - * each new block will have the same number of entries. - */ - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add_cpu(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - /* - * For non-leaf blocks, copy keys and addresses over to the new block. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* left btree key pointer */ - xfs_alloc_ptr_t *lpp; /* left btree address pointer */ - xfs_alloc_key_t *rkp; /* right btree key pointer */ - xfs_alloc_ptr_t *rpp; /* right btree address pointer */ - - lkp = XFS_ALLOC_KEY_ADDR(left, i, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, i, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *keyp = *rkp; + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); } - /* - * For leaf blocks, copy records over to the new block. - */ - else { - xfs_alloc_rec_t *lrp; /* left btree record pointer */ - xfs_alloc_rec_t *rrp; /* right btree record pointer */ +} +#endif /* DEBUG */ - lrp = XFS_ALLOC_REC_ADDR(left, i, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ar_startblock = rrp->ar_startblock; - keyp->ar_blockcount = rrp->ar_blockcount; - } - /* - * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. - */ - lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp)); - be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be32(rbno); - right->bb_leftsib = cpu_to_be32(lbno); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there's a block to the new block's right, make that block - * point back to right instead of to left. - */ - if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) { - xfs_alloc_block_t *rrblock; /* rr btree block */ - xfs_buf_t *rrbp; /* buffer for rrblock */ +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_allocbt_trace_buf; - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0, - &rrbp, XFS_ALLOC_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(rbno); - xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * If the cursor is really in the right block, move it there. - * If it's just pointing past the last entry in left, then we'll - * insert there, so don't change anything in that case. - */ - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - /* - * If there are more levels, we'll need another cursor which refers to - * the right block, no matter where this cursor was. - */ - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) - return error; - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = rbno; - *stat = 1; - return 0; +STATIC void +xfs_allocbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); } -/* - * Update keys at all levels from here to the root along the cursor's path. - */ -STATIC int /* error */ -xfs_alloc_updkey( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_alloc_key_t *keyp, /* new key value to update to */ - int level) /* starting level for update */ +STATIC void +xfs_allocbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) { - int ptr; /* index of key in block */ - - /* - * Go up the tree from this level toward the root. - * At each level, update the key value to the value input. - * Stop when we reach a level where the cursor isn't pointing - * at the first entry in the block. - */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - xfs_alloc_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer for block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - xfs_alloc_key_t *kp; /* ptr to btree block keys */ - - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); - *kp = *keyp; - xfs_alloc_log_keys(cur, bp, ptr, ptr); - } - return 0; + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.a.ar_startblock; + *l1 = cur->bc_rec.a.ar_blockcount; } -/* - * Externally visible routines. - */ - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_alloc_decrement( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) { - xfs_alloc_block_t *block; /* btree block */ - int error; /* error return value */ - int lev; /* btree level */ - - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the left at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - /* - * Decrement the ptr at this level. If we're still in the block - * then we're done. - */ - if (--cur->bc_ptrs[level] > 0) { - *stat = 1; - return 0; - } - /* - * Get a pointer to the btree block. - */ - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, - cur->bc_bufs[level]))) - return error; -#endif - /* - * If we just went off the left edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree decrementing pointers. - * Stop when we don't go off the left edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - /* - * Read-ahead the left block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - xfs_buf_t *bp; /* buffer pointer for block */ - - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_ALLOC_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); - } - *stat = 1; - return 0; + *l0 = be32_to_cpu(key->alloc.ar_startblock); + *l1 = be32_to_cpu(key->alloc.ar_blockcount); } -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -int /* error */ -xfs_alloc_delete( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) { - int error; /* error return value */ - int i; /* result code */ - int level; /* btree level */ - - /* - * Go up the tree, starting at leaf level. - * If 2 is returned then a join was done; go to the next level. - * Otherwise we are done. - */ - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_alloc_delrec(cur, level, &i))) - return error; - } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_alloc_decrement(cur, level, &i))) - return error; - break; - } - } - } - *stat = i; - return 0; + *l0 = be32_to_cpu(rec->alloc.ar_startblock); + *l1 = be32_to_cpu(rec->alloc.ar_blockcount); + *l2 = 0; } +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .kill_root = xfs_allocbt_kill_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, -/* - * Get the data from the pointed-to record. - */ -int /* error */ -xfs_alloc_get_rec( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t *bno, /* output: starting block of extent */ - xfs_extlen_t *len, /* output: length of extent */ - int *stat) /* output: success/failure */ -{ - xfs_alloc_block_t *block; /* btree block */ #ifdef DEBUG - int error; /* error return value */ + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, #endif - int ptr; /* record number */ - ptr = cur->bc_ptrs[0]; - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) - return error; +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_allocbt_trace_enter, + .trace_cursor = xfs_allocbt_trace_cursor, + .trace_key = xfs_allocbt_trace_key, + .trace_record = xfs_allocbt_trace_record, #endif - /* - * Off the right end or left end, return failure. - */ - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - /* - * Point to the record and extract its data. - */ - { - xfs_alloc_rec_t *rec; /* record data */ - - rec = XFS_ALLOC_REC_ADDR(block, ptr, cur); - *bno = be32_to_cpu(rec->ar_startblock); - *len = be32_to_cpu(rec->ar_blockcount); - } - *stat = 1; - return 0; -} +}; /* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. + * Allocate a new allocation btree cursor. */ -int /* error */ -xfs_alloc_increment( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ { - xfs_alloc_block_t *block; /* btree block */ - xfs_buf_t *bp; /* tree block buffer */ - int error; /* error return value */ - int lev; /* btree level */ - - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the right at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - /* - * Get a pointer to the btree block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Increment the ptr at this level. If we're still in the block - * then we're done. - */ - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - *stat = 1; - return 0; - } - /* - * If we just went off the right edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree incrementing pointers. - * Stop when we don't go off the right edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - bp = cur->bc_bufs[lev]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; -#endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - /* - * Read-ahead the right block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp); - lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_ALLOC_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = 1; - } - *stat = 1; - return 0; -} + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -int /* error */ -xfs_alloc_insert( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* result value, 0 for failure */ - int level; /* current level number in btree */ - xfs_agblock_t nbno; /* new block number (split result) */ - xfs_btree_cur_t *ncur; /* new cursor (split result) */ - xfs_alloc_rec_t nrec; /* record being inserted this level */ - xfs_btree_cur_t *pcur; /* previous level's cursor */ + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); - level = 0; - nbno = NULLAGBLOCK; - nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); - nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); - ncur = NULL; - pcur = cur; - /* - * Loop going up the tree, starting at the leaf level. - * Stop when we don't get a split block, that must mean that - * the insert is finished with this level. - */ - do { - /* - * Insert nrec/nbno into this level of the tree. - * Note if we fail, nbno will be null. - */ - if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - return error; - } - /* - * See if the cursor we just used is trash. - * Can't trash the caller's cursor, but otherwise we should - * if ncur is a new cursor or we're about to be done. - */ - if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - /* - * If we got a new cursor, switch to it. - */ - if (ncur) { - pcur = ncur; - ncur = NULL; - } - } while (nbno != NULLAGBLOCK); - *stat = i; - return 0; -} + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]); + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; -/* - * Lookup the record equal to [bno, len] in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_eq( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat); -} + cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_CNT) + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; -/* - * Lookup the first record greater than or equal to [bno, len] - * in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_ge( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat); -} + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; -/* - * Lookup the first record less than or equal to [bno, len] - * in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_le( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat); + return cur; } /* - * Update the record referred to by cur, to the value given by [bno, len]. - * This either works (return 0) or gets an EFSCORRUPTED error. + * Calculate number of records in an alloc btree block. */ -int /* error */ -xfs_alloc_update( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len) /* length of extent */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) { - xfs_alloc_block_t *block; /* btree block to update */ - int error; /* error return value */ - int ptr; /* current record number (updating) */ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); - ASSERT(len > 0); - /* - * Pick up the a.g. freelist struct and the current block. - */ - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) - return error; -#endif - /* - * Get the address of the rec to be updated. - */ - ptr = cur->bc_ptrs[0]; - { - xfs_alloc_rec_t *rp; /* pointer to updated record */ - - rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); - /* - * Fill in the new contents and log them. - */ - rp->ar_startblock = cpu_to_be32(bno); - rp->ar_blockcount = cpu_to_be32(len); - xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr); - } - /* - * If it's the by-size btree and it's the last leaf block and - * it's the last record... then update the size of the longest - * extent in the a.g., which we cache in the a.g. freelist header. - */ - if (cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr == be16_to_cpu(block->bb_numrecs)) { - xfs_agf_t *agf; /* a.g. freespace header */ - xfs_agnumber_t seqno; - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - seqno = be32_to_cpu(agf->agf_seqno); - cur->bc_mp->m_perag[seqno].pagf_longest = len; - agf->agf_longest = cpu_to_be32(len); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); - } - /* - * Updating first record in leaf. Pass new key value up to our parent. - */ - if (ptr == 1) { - xfs_alloc_key_t key; /* key containing [bno, len] */ - - key.ar_startblock = cpu_to_be32(bno); - key.ar_blockcount = cpu_to_be32(len); - if ((error = xfs_alloc_updkey(cur, &key, 1))) - return error; - } - return 0; + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 5bd1a2c8bd07..a6caa0022c9b 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -24,7 +24,6 @@ struct xfs_buf; struct xfs_btree_cur; -struct xfs_btree_sblock; struct xfs_mount; /* @@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore { /* btree pointer type */ typedef __be32 xfs_alloc_ptr_t; -/* btree block header type */ -typedef struct xfs_btree_sblock xfs_alloc_block_t; - -#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp)) - -/* - * Real block structures have a size equal to the disk block size. - */ -#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0]) -#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0]) /* * Minimum and maximum blocksize and sectorsize. @@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t; #define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) /* - * Record, key, and pointer address macros for btree blocks. - */ -#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ - XFS_BTREE_REC_ADDR(xfs_alloc, bb, i) - -#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ - XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i) - -#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ - XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat); - -/* - * Get the data from the pointed-to record. - */ -extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno, - xfs_extlen_t *len, int *stat); - -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat); - -/* - * Lookup the record equal to [bno, len] in the btree given by cur. - */ -extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); - -/* - * Lookup the first record greater than or equal to [bno, len] - * in the btree given by cur. - */ -extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); - -/* - * Lookup the first record less than or equal to [bno, len] - * in the btree given by cur. + * Btree block header size depends on a superblock flag. + * + * (not quite yet, but soon) */ -extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); +#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN /* - * Update the record referred to by cur, to the value given by [bno, len]. - * This either works (return 0) or gets an EFSCORRUPTED error. - */ -extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len); + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) + +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index 0b3b5efe848c..53d5e70d1360 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -41,21 +41,36 @@ #endif #ifdef XFS_NATIVE_HOST -#define cpu_to_be16(val) ((__be16)(val)) -#define cpu_to_be32(val) ((__be32)(val)) -#define cpu_to_be64(val) ((__be64)(val)) -#define be16_to_cpu(val) ((__uint16_t)(val)) -#define be32_to_cpu(val) ((__uint32_t)(val)) -#define be64_to_cpu(val) ((__uint64_t)(val)) +#define cpu_to_be16(val) ((__force __be16)(__u16)(val)) +#define cpu_to_be32(val) ((__force __be32)(__u32)(val)) +#define cpu_to_be64(val) ((__force __be64)(__u64)(val)) +#define be16_to_cpu(val) ((__force __u16)(__be16)(val)) +#define be32_to_cpu(val) ((__force __u32)(__be32)(val)) +#define be64_to_cpu(val) ((__force __u64)(__be64)(val)) #else -#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) -#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) -#define cpu_to_be64(val) (__swab64((__uint64_t)(val))) -#define be16_to_cpu(val) (__swab16((__be16)(val))) -#define be32_to_cpu(val) (__swab32((__be32)(val))) -#define be64_to_cpu(val) (__swab64((__be64)(val))) +#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val))) +#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val))) +#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val))) +#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val))) +#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val))) +#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val))) #endif +static inline void be16_add_cpu(__be16 *a, __s16 b) +{ + *a = cpu_to_be16(be16_to_cpu(*a) + b); +} + +static inline void be32_add_cpu(__be32 *a, __s32 b) +{ + *a = cpu_to_be32(be32_to_cpu(*a) + b); +} + +static inline void be64_add_cpu(__be64 *a, __s64 b) +{ + *a = cpu_to_be64(be64_to_cpu(*a) + b); +} + #endif /* __KERNEL__ */ /* do we need conversion? */ diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 8e0e463dae2d..bca7b243c319 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v) /* Get low bit set out of 32-bit argument, -1 if none set */ static inline int xfs_lowbit32(__uint32_t v) { - unsigned long t = v; - return (v) ? find_first_bit(&t, 32) : -1; + return ffs(v) - 1; } /* Get low bit set out of 64-bit argument, -1 if none set */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index a1aab9275d5a..138308e70d14 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -393,8 +393,8 @@ xfs_bmap_count_leaves( STATIC void xfs_bmap_disk_count_leaves( - xfs_extnum_t idx, - xfs_bmbt_block_t *block, + struct xfs_mount *mp, + struct xfs_btree_block *block, int numrecs, int *count); @@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves( * Bmap internal routines. */ +STATIC int /* error */ +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +STATIC int /* error */ +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* +* Update the record referred to by cur to the value given + * by [off, bno, len, state]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_bmbt_update( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + union xfs_btree_rec rec; + + xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + return xfs_btree_update(cur, &rec); +} + /* * Called from xfs_bmap_add_attrfork to handle btree format files. */ @@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree( if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) *flags |= XFS_ILOG_DBROOT; else { - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - XFS_DATA_FORK); + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); cur->bc_private.b.flist = flist; cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; /* must be at least one entry */ XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); - if ((error = xfs_bmbt_newroot(cur, flags, &stat))) + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); @@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, @@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, @@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real( &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, @@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_blockcount, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, @@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_blockcount - new->br_blockcount, oldext))) goto done; - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, @@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real( oldext))) goto done; cur->bc_rec.b = *new; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_blockcount - new->br_blockcount, oldext))) goto done; - if ((error = xfs_bmbt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto done; if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, @@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = PREV; cur->bc_rec.b.br_blockcount = new->br_startoff - PREV.br_startoff; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* @@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real( XFS_WANT_CORRUPTED_GOTO(i == 0, done); /* new middle extent - newext */ cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, left.br_startoff, @@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real( goto done; XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); } @@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents( int whichfork) /* data or attr fork */ { /* REFERENCED */ - xfs_bmbt_block_t *cblock;/* child btree block */ + struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork data */ xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - xfs_bmbt_block_t *rblock;/* root btree block */ + struct xfs_btree_block *rblock;/* root btree block */ + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); - mp = ip->i_mount; - pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); *logflagsp = 0; #ifdef DEBUG @@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF))) return error; - cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); - if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); ip->i_d.di_nblocks--; @@ -3170,7 +3216,7 @@ xfs_bmap_del_extent( flags |= XFS_ILOG_FEXT(whichfork); break; } - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; XFS_WANT_CORRUPTED_GOTO(i == 1, done); break; @@ -3254,10 +3300,10 @@ xfs_bmap_del_extent( got.br_startblock, temp, got.br_state))) goto done; - if ((error = xfs_bmbt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto done; cur->bc_rec.b = new; - error = xfs_bmbt_insert(cur, &i); + error = xfs_btree_insert(cur, &i); if (error && error != ENOSPC) goto done; /* @@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree( int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - xfs_bmbt_block_t *ablock; /* allocated (child) bt block */ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ xfs_buf_t *abp; /* buffer for ablock */ xfs_alloc_arg_t args; /* allocation arguments */ xfs_bmbt_rec_t *arp; /* child record pointer */ - xfs_bmbt_block_t *block; /* btree root block */ + struct xfs_btree_block *block; /* btree root block */ xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ @@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree( */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; + /* * Fill in the root. */ @@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree( block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); block->bb_level = cpu_to_be16(1); block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_rightsib = cpu_to_be64(NULLDFSBNO); + block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + /* * Need a cursor. Can't allocate until bb_level is filled in. */ mp = ip->i_mount; - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - whichfork); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; @@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ - ablock = XFS_BUF_TO_BMBT_BLOCK(abp); + ablock = XFS_BUF_TO_BLOCK(abp); ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); ablock->bb_level = 0; - ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); + ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); @@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree( } } ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - ablock->bb_numrecs = cpu_to_be16(cnt); + xfs_btree_set_numrecs(ablock, cnt); + /* * Fill in the root key and pointer. */ - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); *pp = cpu_to_be64(args.fsbno); + /* * Do all this logging at the end so that * the root is at the right level. */ - xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); ASSERT(*curp == NULL); *curp = cur; *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); @@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels( maxleafents = MAXAEXTNUM; sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); } - maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); + maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; @@ -4242,9 +4292,15 @@ xfs_bmap_finish( * We have a new transaction, so we should return committed=1, * even though we're returning an error. */ - if (error) { + if (error) return error; - } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, logcount))) return error; @@ -4474,6 +4530,22 @@ xfs_bmap_one_block( return rval; } +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC || + be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + return 1; +} + /* * Read in the extents to if_extents. * All inode fields are set up by caller, we just traverse the btree @@ -4486,7 +4558,7 @@ xfs_bmap_read_extents( xfs_inode_t *ip, /* incore inode */ int whichfork) /* data or attr fork */ { - xfs_bmbt_block_t *block; /* current btree block */ + struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ @@ -4510,7 +4582,7 @@ xfs_bmap_read_extents( */ level = be16_to_cpu(block->bb_level); ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLDFSBNO); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); @@ -4523,13 +4595,13 @@ xfs_bmap_read_extents( if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) return error; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, level), + xfs_bmap_sanity_check(mp, bp, level), error0); if (level == 0) break; - pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); @@ -4549,7 +4621,7 @@ xfs_bmap_read_extents( xfs_extnum_t start; - num_recs = be16_to_cpu(block->bb_numrecs); + num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, @@ -4561,18 +4633,18 @@ xfs_bmap_read_extents( goto error0; } XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, 0), + xfs_bmap_sanity_check(mp, bp, 0), error0); /* * Read-ahead the next leaf block, if any. */ - nextbno = be64_to_cpu(block->bb_rightsib); + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); if (nextbno != NULLFSBLOCK) xfs_btree_reada_bufl(mp, nextbno, 1); /* * Copy records into the extent records. */ - frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); + frp = XFS_BMBT_REC_ADDR(mp, block, 1); start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); @@ -4603,7 +4675,7 @@ xfs_bmap_read_extents( if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) return error; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); } ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); @@ -5029,8 +5101,7 @@ xfs_bmapi( if (abno == NULLFSBLOCK) break; if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_btree_init_cursor(mp, - tp, NULL, 0, XFS_BTNUM_BMAP, + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5147,9 +5218,8 @@ xfs_bmapi( */ ASSERT(mval->br_blockcount <= len); if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_btree_init_cursor(mp, - tp, NULL, 0, XFS_BTNUM_BMAP, - ip, whichfork); + cur = xfs_bmbt_init_cursor(mp, + tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; @@ -5440,8 +5510,7 @@ xfs_bunmapi( logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - whichfork); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; cur->bc_private.b.flags = 0; @@ -5742,14 +5811,17 @@ error0: STATIC int xfs_getbmapx_fix_eof_hole( xfs_inode_t *ip, /* xfs incore inode pointer */ - struct getbmap *out, /* output structure */ + struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with - * preallocated data space */ + * preallocated data space */ __int64_t end, /* last block requested */ xfs_fsblock_t startblock) { __int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; if (startblock == HOLESTARTBLOCK) { mp = ip->i_mount; @@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole( out->bmv_length = fixlen; } } else { - out->bmv_block = XFS_FSB_TO_DB(ip, startblock); + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = XFS_FSB_TO_DB(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; } return 1; } /* - * Fcntl interface to xfs_bmapi. + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. */ int /* error code */ xfs_getbmap( xfs_inode_t *ip, - struct getbmap *bmv, /* user bmap structure */ - void __user *ap, /* pointer to user's array */ - int interface) /* interface flags */ + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ { __int64_t bmvend; /* last block requested */ int error; /* return value */ @@ -5790,19 +5874,17 @@ xfs_getbmap( int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ - struct getbmap out; /* output structure */ + struct getbmapx out; /* output structure */ int whichfork; /* data or attr fork */ int prealloced; /* this is a file with * preallocated data space */ - int sh_unwritten; /* true, if unwritten */ - /* extents listed separately */ + int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ - __int32_t oflags; /* getbmapx bmv_oflags field */ mp = ip->i_mount; + iflags = bmv->bmv_iflags; - whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - sh_unwritten = (interface & BMV_IF_PREALLOC) != 0; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ @@ -5817,7 +5899,7 @@ xfs_getbmap( * could misinterpret holes in a DMAPI file as true holes, * when in fact they may represent offline user data. */ - if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && + if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_READ) && whichfork == XFS_DATA_FORK) { error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); @@ -5873,8 +5955,9 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && - (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { + if (((iflags & BMV_IF_DELALLOC) == 0) && + (whichfork == XFS_DATA_FORK) && + (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ error = xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); @@ -5884,7 +5967,8 @@ xfs_getbmap( } } - ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); + ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) || + ip->i_delayed_blks == 0); lock = xfs_ilock_map_shared(ip); @@ -5896,7 +5980,7 @@ xfs_getbmap( nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | - ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); + ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); /* * Allocate enough space to handle "subnex" maps at a time. @@ -5906,9 +5990,12 @@ xfs_getbmap( bmv->bmv_entries = 0; - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { - error = 0; - goto unlock_and_return; + if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) { + if (((iflags & BMV_IF_DELALLOC) == 0) || + whichfork == XFS_ATTR_FORK) { + error = 0; + goto unlock_and_return; + } } nexleft = nex; @@ -5924,52 +6011,40 @@ xfs_getbmap( ASSERT(nmap <= subnex); for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - nexleft--; - oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? - BMV_OF_PREALLOC : 0; + out.bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out.bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out.bmv_oflags |= BMV_OF_DELALLOC; out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + out.bmv_unused1 = out.bmv_unused2 = 0; + ASSERT(((iflags & BMV_IF_DELALLOC) != 0) || + (map[i].br_startblock != DELAYSTARTBLOCK)); if (map[i].br_startblock == HOLESTARTBLOCK && whichfork == XFS_ATTR_FORK) { /* came to the end of attribute fork */ + out.bmv_oflags |= BMV_OF_LAST; goto unlock_and_return; } else { + int full = 0; /* user array is full */ + if (!xfs_getbmapx_fix_eof_hole(ip, &out, prealloced, bmvend, map[i].br_startblock)) { goto unlock_and_return; } - /* return either getbmap/getbmapx structure. */ - if (interface & BMV_IF_EXTENDED) { - struct getbmapx outx; - - GETBMAP_CONVERT(out,outx); - outx.bmv_oflags = oflags; - outx.bmv_unused1 = outx.bmv_unused2 = 0; - if (copy_to_user(ap, &outx, - sizeof(outx))) { - error = XFS_ERROR(EFAULT); - goto unlock_and_return; - } - } else { - if (copy_to_user(ap, &out, - sizeof(out))) { - error = XFS_ERROR(EFAULT); - goto unlock_and_return; - } - } + /* format results & advance arg */ + error = formatter(&arg, &out, &full); + if (error || full) + goto unlock_and_return; + nexleft--; bmv->bmv_offset = out.bmv_offset + out.bmv_length; bmv->bmv_length = MAX((__int64_t)0, (__int64_t)(bmvend - bmv->bmv_offset)); bmv->bmv_entries++; - ap = (interface & BMV_IF_EXTENDED) ? - (void __user *) - ((struct getbmapx __user *)ap + 1) : - (void __user *) - ((struct getbmap __user *)ap + 1); } } } while (nmap && nexleft && bmv->bmv_length); @@ -6131,7 +6206,7 @@ xfs_bmap_get_bp( void xfs_check_block( - xfs_bmbt_block_t *block, + struct xfs_btree_block *block, xfs_mount_t *mp, int root, short sz) @@ -6143,36 +6218,29 @@ xfs_check_block( ASSERT(be16_to_cpu(block->bb_level) > 0); prevp = NULL; - for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { dmxr = mp->m_bmap_dmxr[0]; - - if (root) { - keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); - } else { - keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i); - } + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); if (prevp) { - xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); } prevp = keyp; /* * Compare the block numbers to see if there are dups. */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); - if (root) { - pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); - } else { - pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr); - } for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) { - thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); - } else { - thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, - dmxr); - } + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); if (*thispa == *pp) { cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", __func__, j, i, @@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents( xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { - xfs_bmbt_block_t *block; /* current btree block */ + struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ @@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLDFSBNO); @@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents( if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF))) goto error_norelse; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, level), + xfs_bmap_sanity_check(mp, bp, level), error0); if (level == 0) break; @@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents( */ xfs_check_block(block, mp, 0, 0); - pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { @@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents( xfs_extnum_t num_recs; - num_recs = be16_to_cpu(block->bb_numrecs); + num_recs = xfs_btree_get_numrecs(block); /* * Read-ahead the next leaf block, if any. */ - nextbno = be64_to_cpu(block->bb_rightsib); + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); /* * Check all the extents to make sure they are OK. @@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents( * conform with the first entry in this one. */ - ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); + ep = XFS_BMBT_REC_ADDR(mp, block, 1); if (i) { - xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); } for (j = 1; j < num_recs; j++) { - nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); - xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); ep = nextp; } @@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents( if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, XFS_BMAP_BTREE_REF))) goto error_norelse; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); } if (bp_release) { bp_release = 0; @@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks( int whichfork, /* data or attr fork */ int *count) /* out: count of blocks */ { - xfs_bmbt_block_t *block; /* current btree block */ + struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ @@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks( block = ifp->if_broot; level = be16_to_cpu(block->bb_level); ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); ASSERT(bno != NULLDFSBNO); ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); @@ -6413,29 +6485,29 @@ xfs_bmap_count_tree( __be64 *pp; xfs_fsblock_t bno = blockno; xfs_fsblock_t nextbno; - xfs_bmbt_block_t *block, *nextblock; + struct xfs_btree_block *block, *nextblock; int numrecs; if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) return error; *count += 1; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); if (--level) { /* Not at node above leafs, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_rightsib); + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, XFS_BMAP_BTREE_REF))) return error; *count += 1; - nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_rightsib); + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); xfs_trans_brelse(tp, nbp); } /* Dive to the next level */ - pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (unlikely((error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { @@ -6448,9 +6520,9 @@ xfs_bmap_count_tree( } else { /* count all level 1 nodes and their leaves */ for (;;) { - nextbno = be64_to_cpu(block->bb_rightsib); + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); numrecs = be16_to_cpu(block->bb_numrecs); - xfs_bmap_disk_count_leaves(0, block, numrecs, count); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); xfs_trans_brelse(tp, bp); if (nextbno == NULLFSBLOCK) break; @@ -6459,7 +6531,7 @@ xfs_bmap_count_tree( XFS_BMAP_BTREE_REF))) return error; *count += 1; - block = XFS_BUF_TO_BMBT_BLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); } } return 0; @@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves( */ STATIC void xfs_bmap_disk_count_leaves( - xfs_extnum_t idx, - xfs_bmbt_block_t *block, + struct xfs_mount *mp, + struct xfs_btree_block *block, int numrecs, int *count) { @@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves( xfs_bmbt_rec_t *frp; for (b = 1; b <= numrecs; b++) { - frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); + frp = XFS_BMBT_REC_ADDR(mp, block, b); *count += xfs_bmbt_disk_get_blockcount(frp); } } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 9f3e3a836d15..284571c05ed0 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -137,9 +137,7 @@ typedef struct xfs_bmalloca { char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; -#ifdef __KERNEL__ - -#if defined(XFS_BMAP_TRACE) +#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE) /* * Trace operations for bmap extent tracing */ @@ -163,9 +161,12 @@ xfs_bmap_trace_exlist( int whichfork); /* data or attr fork */ #define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ xfs_bmap_trace_exlist(__func__,ip,c,w) -#else + +#else /* __KERNEL__ && XFS_BMAP_TRACE */ + #define XFS_BMAP_TRACE_EXLIST(ip,c,w) -#endif + +#endif /* __KERNEL__ && XFS_BMAP_TRACE */ /* * Convert inode from non-attributed to attributed. @@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels( int whichfork); /* data or attr fork */ /* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - int *committed); /* xact committed or not */ - -/* * Returns the file-relative block number of the first unused block in the file. * This is the lowest-address hole if the file has holes, else the first block * past the end of file. @@ -344,14 +331,43 @@ xfs_bunmapi( int *done); /* set if not done yet */ /* - * Fcntl interface to xfs_bmapi. + * Check an extent list, which has just been read, for + * any bit in the extent flag field. + */ +int +xfs_check_nostate_extents( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + xfs_extnum_t num); + +#ifdef __KERNEL__ + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. + * + * Return 1 if the given transaction was committed and a new one allocated, + * and 0 otherwise. + */ +int /* error */ +xfs_bmap_finish( + struct xfs_trans **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed); /* xact committed or not */ + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); + +/* + * Get inode's extents as described in bmv, and format for output. */ int /* error code */ xfs_getbmap( xfs_inode_t *ip, - struct getbmap *bmv, /* user bmap structure */ - void __user *ap, /* pointer to user's array */ - int iflags); /* interface flags */ + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg); /* formatter arg */ /* * Check if the endoff is outside the last extent. If so the caller will grow @@ -375,16 +391,6 @@ xfs_bmap_count_blocks( int *count); /* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -/* * Search the extent records for the entry containing block bno. * If bno lies in a hole, point to the next entry. If bno lies * past eof, *eofp will be set, and *prevp will contain the last diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 23efad29a5cd..8f1ec73725d3 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -37,1406 +37,13 @@ #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" -#if defined(XFS_BMBT_TRACE) -ktrace_t *xfs_bmbt_trace_buf; -#endif - -/* - * Prototypes for internal btree functions. - */ - - -STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *); -STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, - __uint64_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); - - -#if defined(XFS_BMBT_TRACE) - -static char ARGS[] = "args"; -static char ENTRY[] = "entry"; -static char ERROR[] = "error"; -#undef EXIT -static char EXIT[] = "exit"; - -/* - * Add a trace buffer entry for the arguments given to the routine, - * generic form. - */ -STATIC void -xfs_bmbt_trace_enter( - const char *func, - xfs_btree_cur_t *cur, - char *s, - int type, - int line, - __psunsigned_t a0, - __psunsigned_t a1, - __psunsigned_t a2, - __psunsigned_t a3, - __psunsigned_t a4, - __psunsigned_t a5, - __psunsigned_t a6, - __psunsigned_t a7, - __psunsigned_t a8, - __psunsigned_t a9, - __psunsigned_t a10) -{ - xfs_inode_t *ip; - int whichfork; - - ip = cur->bc_private.b.ip; - whichfork = cur->bc_private.b.whichfork; - ktrace_enter(xfs_bmbt_trace_buf, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); - ASSERT(ip->i_btrace); - ktrace_enter(ip->i_btrace, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); -} -/* - * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. - */ -STATIC void -xfs_bmbt_trace_argbi( - const char *func, - xfs_btree_cur_t *cur, - xfs_buf_t *b, - int i, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line, - (__psunsigned_t)b, i, 0, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for a buffer & 2 integer args. - */ -STATIC void -xfs_bmbt_trace_argbii( - const char *func, - xfs_btree_cur_t *cur, - xfs_buf_t *b, - int i0, - int i1, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line, - (__psunsigned_t)b, i0, i1, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for 3 block-length args - * and an integer arg. - */ -STATIC void -xfs_bmbt_trace_argfffi( - const char *func, - xfs_btree_cur_t *cur, - xfs_dfiloff_t o, - xfs_dfsbno_t b, - xfs_dfilblks_t i, - int j, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line, - o >> 32, (int)o, b >> 32, (int)b, - i >> 32, (int)i, (int)j, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for one integer arg. - */ -STATIC void -xfs_bmbt_trace_argi( - const char *func, - xfs_btree_cur_t *cur, - int i, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line, - i, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, key. - */ -STATIC void -xfs_bmbt_trace_argifk( - const char *func, - xfs_btree_cur_t *cur, - int i, - xfs_fsblock_t f, - xfs_dfiloff_t o, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32, - (int)o, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, rec. - */ -STATIC void -xfs_bmbt_trace_argifr( - const char *func, - xfs_btree_cur_t *cur, - int i, - xfs_fsblock_t f, - xfs_bmbt_rec_t *r, - int line) -{ - xfs_dfsbno_t b; - xfs_dfilblks_t c; - xfs_dfsbno_t d; - xfs_dfiloff_t o; - xfs_bmbt_irec_t s; - - d = (xfs_dfsbno_t)f; - xfs_bmbt_disk_get_all(r, &s); - o = (xfs_dfiloff_t)s.br_startoff; - b = (xfs_dfsbno_t)s.br_startblock; - c = s.br_blockcount; - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line, - i, d >> 32, (int)d, o >> 32, - (int)o, b >> 32, (int)b, c >> 32, - (int)c, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, key. - */ -STATIC void -xfs_bmbt_trace_argik( - const char *func, - xfs_btree_cur_t *cur, - int i, - xfs_bmbt_key_t *k, - int line) -{ - xfs_dfiloff_t o; - - o = be64_to_cpu(k->br_startoff); - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, o >> 32, (int)o, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for the cursor/operation. - */ -STATIC void -xfs_bmbt_trace_cursor( - const char *func, - xfs_btree_cur_t *cur, - char *s, - int line) -{ - xfs_bmbt_rec_host_t r; - - xfs_bmbt_set_all(&r, &cur->bc_rec.b); - xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line, - (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) | - cur->bc_private.b.allocated, - r.l0 >> 32, (int)r.l0, - r.l1 >> 32, (int)r.l1, - (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1], - (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3], - (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], - (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); -} - -#define XFS_BMBT_TRACE_ARGBI(c,b,i) \ - xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__) -#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ - xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__) -#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ - xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__) -#define XFS_BMBT_TRACE_ARGI(c,i) \ - xfs_bmbt_trace_argi(__func__, c, i, __LINE__) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ - xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__) -#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ - xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__) -#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ - xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__) -#define XFS_BMBT_TRACE_CURSOR(c,s) \ - xfs_bmbt_trace_cursor(__func__, c, s, __LINE__) -#else -#define XFS_BMBT_TRACE_ARGBI(c,b,i) -#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) -#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) -#define XFS_BMBT_TRACE_ARGI(c,i) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) -#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) -#define XFS_BMBT_TRACE_ARGIK(c,i,k) -#define XFS_BMBT_TRACE_CURSOR(c,s) -#endif /* XFS_BMBT_TRACE */ - - -/* - * Internal functions. - */ - -/* - * Delete record pointed to by cur/level. - */ -STATIC int /* error */ -xfs_bmbt_delrec( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_fsblock_t bno; /* fs-relative block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop counter */ - int j; /* temp state */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ - xfs_fsblock_t lbno; /* left sibling block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - int lrecs=0; /* left record count */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - int ptr; /* key/record index */ - xfs_fsblock_t rbno; /* right sibling block number */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_block_t *rrblock; /* right-right btree block */ - xfs_buf_t *rrbp; /* right-right buffer pointer */ - int rrecs=0; /* right record count */ - xfs_bmbt_rec_t *rrp; /* right record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ - int numrecs; /* temporary numrec count */ - int numlrecs, numrrecs; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ptr = cur->bc_ptrs[level]; - tcur = NULL; - if (ptr == 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - block = xfs_bmbt_get_block(cur, level, &bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - if (ptr > numrecs) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_bmbt_delrec); - if (level > 0) { - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } -#endif - if (ptr < numrecs) { - memmove(&kp[ptr - 1], &kp[ptr], - (numrecs - ptr) * sizeof(*kp)); - memmove(&pp[ptr - 1], &pp[ptr], - (numrecs - ptr) * sizeof(*pp)); - xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1); - xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1); - } - } else { - rp = XFS_BMAP_REC_IADDR(block, 1, cur); - if (ptr < numrecs) { - memmove(&rp[ptr - 1], &rp[ptr], - (numrecs - ptr) * sizeof(*rp)); - xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); - } - if (ptr == 1) { - key.br_startoff = - cpu_to_be64(xfs_bmbt_disk_get_startoff(rp)); - kp = &key; - } - } - numrecs--; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); - /* - * We're at the root level. - * First, shrink the root block in-memory. - * Try to get rid of the next level down. - * If we can't then there's nothing left to do. - */ - if (level == cur->bc_nlevels - 1) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); - if ((error = xfs_bmbt_killroot(cur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - rbno = be64_to_cpu(block->bb_rightsib); - lbno = be64_to_cpu(block->bb_leftsib); - /* - * One child of root, need to get a chance to copy its contents - * into the root and delete it. Can't go up to next level, - * there's nothing to delete there. - */ - if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK && - level == cur->bc_nlevels - 2) { - if ((error = xfs_bmbt_killroot(cur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK); - if ((error = xfs_btree_dup_cursor(cur, &tcur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - bno = NULLFSBLOCK; - if (rbno != NULLFSBLOCK) { - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_increment(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - bno = be64_to_cpu(right->bb_leftsib); - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if ((error = xfs_bmbt_lshift(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_BMAP_BLOCK_IMINRECS(level, tcur)); - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - if (level > 0) { - if ((error = xfs_bmbt_decrement(cur, - level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, - ERROR); - goto error0; - } - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLFSBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_decrement(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } - } - if (lbno != NULLFSBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * decrement to last in block - */ - if ((error = xfs_bmbt_decrement(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_BMBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - bno = be64_to_cpu(left->bb_rightsib); - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if ((error = xfs_bmbt_rshift(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_BMAP_BLOCK_IMINRECS(level, tcur)); - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - if (level == 0) - cur->bc_ptrs[0]++; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - lrecs = be16_to_cpu(left->bb_numrecs); - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - mp = cur->bc_mp; - ASSERT(bno != NULLFSBLOCK); - if (lbno != NULLFSBLOCK && - lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - rbno = bno; - right = block; - rbp = bp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } else if (rbno != NULLFSBLOCK && - rrecs + be16_to_cpu(block->bb_numrecs) <= - XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - lbno = bno; - left = block; - lbp = bp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - right = XFS_BUF_TO_BMBT_BLOCK(rbp); - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - lrecs = be16_to_cpu(left->bb_numrecs); - } else { - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - numlrecs = be16_to_cpu(left->bb_numrecs); - numrrecs = be16_to_cpu(right->bb_numrecs); - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur); - lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < numrrecs; i++) { - if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } -#endif - memcpy(lkp, rkp, numrrecs * sizeof(*lkp)); - memcpy(lpp, rpp, numrrecs * sizeof(*lpp)); - xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - } else { - lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memcpy(lrp, rrp, numrrecs * sizeof(*lrp)); - xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - } - be16_add_cpu(&left->bb_numrecs, numrrecs); - left->bb_rightsib = right->bb_rightsib; - xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS); - if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) { - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, - be64_to_cpu(left->bb_rightsib), - 0, &rrbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); - if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - rrblock->bb_leftsib = cpu_to_be64(lbno); - xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); - } - xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1, - cur->bc_private.b.flist, mp); - cur->bc_private.b.ip->i_d.di_nblocks--; - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(cur->bc_tp, rbp); - if (bp != lbp) { - cur->bc_bufs[level] = lbp; - cur->bc_ptrs[level] += lrecs; - cur->bc_ra[level] = 0; - } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0) - cur->bc_ptrs[level]--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 2; - return 0; - -error0: - if (tcur) - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; -} - -/* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. - */ -STATIC int /* error */ -xfs_bmbt_insrec( - xfs_btree_cur_t *cur, - int level, - xfs_fsblock_t *bnop, - xfs_bmbt_rec_t *recp, - xfs_btree_cur_t **curp, - int *stat) /* no-go/done/continue */ -{ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ - int logflags; /* inode logging flags */ - xfs_fsblock_t nbno; /* new block number */ - struct xfs_btree_cur *ncur; /* new btree cursor */ - __uint64_t startoff; /* new btree key value */ - xfs_bmbt_rec_t nrec; /* new record count */ - int optr; /* old key/record index */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - int ptr; /* key/record index */ - xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */ - int numrecs; - - ASSERT(level < cur->bc_nlevels); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); - ncur = NULL; - key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp)); - optr = ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_bmbt_insrec); - block = xfs_bmbt_get_block(cur, level, &bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (ptr <= numrecs) { - if (level == 0) { - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp); - } else { - kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); - xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp); - } - } -#endif - nbno = NULLFSBLOCK; - if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { - /* - * A root block, that can be made bigger. - */ - xfs_iroot_realloc(cur->bc_private.b.ip, 1, - cur->bc_private.b.whichfork); - block = xfs_bmbt_get_block(cur, level, &bp); - } else if (level == cur->bc_nlevels - 1) { - if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) || - *stat == 0) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - logflags); - block = xfs_bmbt_get_block(cur, level, &bp); - } else { - if ((error = xfs_bmbt_rshift(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (i) { - /* nothing */ - } else { - if ((error = xfs_bmbt_lshift(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (i) { - optr = ptr = cur->bc_ptrs[level]; - } else { - if ((error = xfs_bmbt_split(cur, level, - &nbno, &startoff, &ncur, - &i))) { - XFS_BMBT_TRACE_CURSOR(cur, - ERROR); - return error; - } - if (i) { - block = xfs_bmbt_get_block( - cur, level, &bp); -#ifdef DEBUG - if ((error = - xfs_btree_check_lblock(cur, - block, level, bp))) { - XFS_BMBT_TRACE_CURSOR( - cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[level]; - xfs_bmbt_disk_set_allf(&nrec, - startoff, 0, 0, - XFS_EXT_NORM); - } else { - XFS_BMBT_TRACE_CURSOR(cur, - EXIT); - *stat = 0; - return 0; - } - } - } - } - } - numrecs = be16_to_cpu(block->bb_numrecs); - if (level > 0) { - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); -#ifdef DEBUG - for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1], - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, *bnop, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - kp[ptr - 1] = key; - pp[ptr - 1] = cpu_to_be64(*bnop); - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_keys(cur, bp, ptr, numrecs); - xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs); - } else { - rp = XFS_BMAP_REC_IADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*rp)); - rp[ptr - 1] = *recp; - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_recs(cur, bp, ptr, numrecs); - } - xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (ptr < numrecs) { - if (level == 0) - xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1, - rp + ptr); - else - xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1, - kp + ptr); - } -#endif - if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - *bnop = nbno; - if (nbno != NULLFSBLOCK) { - *recp = nrec; - *curp = ncur; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -STATIC int -xfs_bmbt_killroot( - xfs_btree_cur_t *cur) -{ - xfs_bmbt_block_t *block; - xfs_bmbt_block_t *cblock; - xfs_buf_t *cbp; - xfs_bmbt_key_t *ckp; - xfs_bmbt_ptr_t *cpp; -#ifdef DEBUG - int error; -#endif - int i; - xfs_bmbt_key_t *kp; - xfs_inode_t *ip; - xfs_ifork_t *ifp; - int level; - xfs_bmbt_ptr_t *pp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = cur->bc_nlevels - 1; - ASSERT(level >= 1); - /* - * Don't deal with the root block needs to be a leaf case. - * We're just going to turn the thing back into extents anyway. - */ - if (level == 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - block = xfs_bmbt_get_block(cur, level, &cbp); - /* - * Give up if the root has multiple children. - */ - if (be16_to_cpu(block->bb_numrecs) != 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - /* - * Only do this if the next level will fit. - * Then the data must be copied up to the inode, - * instead of freeing the root you free the next level. - */ - cbp = cur->bc_bufs[level - 1]; - cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); - if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO); - ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO); - ip = cur->bc_private.b.ip; - ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork); - ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) == - XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes)); - i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur)); - if (i) { - xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork); - block = ifp->if_broot; - } - be16_add_cpu(&block->bb_numrecs, i); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); - memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp)); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp)); - xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1, - cur->bc_private.b.flist, cur->bc_mp); - ip->i_d.di_nblocks--; - XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip, - XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(cur->bc_tp, cbp); - cur->bc_bufs[level - 1] = NULL; - be16_add_cpu(&block->bb_level, -1); - xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - cur->bc_nlevels--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; -} - -/* - * Log key values from the btree block. - */ -STATIC void -xfs_bmbt_log_keys( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int kfirst, - int klast) -{ - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast); - tp = cur->bc_tp; - if (bp) { - xfs_bmbt_block_t *block; - int first; - xfs_bmbt_key_t *kp; - int last; - - block = XFS_BUF_TO_BMBT_BLOCK(bp); - kp = XFS_BMAP_KEY_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - } else { - xfs_inode_t *ip; - - ip = cur->bc_private.b.ip; - xfs_trans_log_inode(tp, ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -/* - * Log pointer values from the btree block. - */ -STATIC void -xfs_bmbt_log_ptrs( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int pfirst, - int plast) -{ - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast); - tp = cur->bc_tp; - if (bp) { - xfs_bmbt_block_t *block; - int first; - int last; - xfs_bmbt_ptr_t *pp; - - block = XFS_BUF_TO_BMBT_BLOCK(bp); - pp = XFS_BMAP_PTR_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - } else { - xfs_inode_t *ip; - - ip = cur->bc_private.b.ip; - xfs_trans_log_inode(tp, ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -/* - * Lookup the record. The cursor is made to point to it, based on dir. - */ -STATIC int /* error */ -xfs_bmbt_lookup( - xfs_btree_cur_t *cur, - xfs_lookup_t dir, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block=NULL; - xfs_buf_t *bp; - xfs_daddr_t d; - xfs_sfiloff_t diff; - int error; /* error return value */ - xfs_fsblock_t fsbno=0; - int high; - int i; - int keyno=0; - xfs_bmbt_key_t *kkbase=NULL; - xfs_bmbt_key_t *kkp; - xfs_bmbt_rec_t *krbase=NULL; - xfs_bmbt_rec_t *krp; - int level; - int low; - xfs_mount_t *mp; - xfs_bmbt_ptr_t *pp; - xfs_bmbt_irec_t *rp; - xfs_fileoff_t startoff; - xfs_trans_t *tp; - - XFS_STATS_INC(xs_bmbt_lookup); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, (int)dir); - tp = cur->bc_tp; - mp = cur->bc_mp; - rp = &cur->bc_rec.b; - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - if (level < cur->bc_nlevels - 1) { - d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = NULL; - if (!bp) { - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, - 0, &bp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - xfs_btree_setbuf(cur, level, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, - level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } else - block = XFS_BUF_TO_BMBT_BLOCK(bp); - } else - block = xfs_bmbt_get_block(cur, level, &bp); - if (diff == 0) - keyno = 1; - else { - if (level > 0) - kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur); - else - krbase = XFS_BMAP_REC_IADDR(block, 1, cur); - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - ASSERT(level == 0); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - while (low <= high) { - XFS_STATS_INC(xs_bmbt_compare); - keyno = (low + high) >> 1; - if (level > 0) { - kkp = kkbase + keyno - 1; - startoff = be64_to_cpu(kkp->br_startoff); - } else { - krp = krbase + keyno - 1; - startoff = xfs_bmbt_disk_get_startoff(krp); - } - diff = (xfs_sfiloff_t) - (startoff - rp->br_startoff); - if (diff < 0) - low = keyno + 1; - else if (diff > 0) - high = keyno - 1; - else - break; - } - } - if (level > 0) { - if (diff > 0 && --keyno < 1) - keyno = 1; - pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); - fsbno = be64_to_cpu(*pp); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, fsbno, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - cur->bc_ptrs[level] = keyno; - } - } - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) && - be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) { - cur->bc_ptrs[0] = keyno; - if ((error = xfs_bmbt_increment(cur, 0, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_WANT_CORRUPTED_RETURN(i == 1); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - } else { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - } - return 0; -} - -/* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_bmbt_lshift( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - int error; /* error return value */ -#ifdef DEBUG - int i; /* loop counter */ -#endif - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp=NULL; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - int lrecs; /* left record count */ - xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp=NULL; /* right btree key */ - xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */ - xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ - int rrecs; /* right record count */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - if (level == cur->bc_nlevels - 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (cur->bc_ptrs[level] <= 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - mp = cur->bc_mp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0, - &lbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - lrecs = be16_to_cpu(left->bb_numrecs) + 1; - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - *lkp = *rkp; - xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs); - lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - *lpp = *rpp; - xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs); - } else { - lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - *lrp = *rrp; - xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs); - } - left->bb_numrecs = cpu_to_be16(lrecs); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp); - else - xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp); -#endif - rrecs = be16_to_cpu(right->bb_numrecs) - 1; - right->bb_numrecs = cpu_to_be16(rrecs); - xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1], - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(rkp, rkp + 1, rrecs * sizeof(*rkp)); - memmove(rpp, rpp + 1, rrecs * sizeof(*rpp)); - xfs_bmbt_log_keys(cur, rbp, 1, rrecs); - xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs); - } else { - memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); - xfs_bmbt_log_recs(cur, rbp, 1, rrecs); - key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); - rkp = &key; - } - if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[level]--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_bmbt_rshift( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* loop counter */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ - struct xfs_btree_cur *tcur; /* temporary btree cursor */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - if (level == cur->bc_nlevels - 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_BMBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - mp = cur->bc_mp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0, - &rbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - right = XFS_BUF_TO_BMBT_BLOCK(rbp); - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - *rkp = *lkp; - *rpp = *lpp; - xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - } else { - lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); - rkp = &key; - } - be16_add_cpu(&left->bb_numrecs, -1); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); - be16_add_cpu(&right->bb_numrecs, 1); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1); - else - xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1); -#endif - xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); - if ((error = xfs_btree_dup_cursor(cur, &tcur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_increment(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(tcur, ERROR); - goto error1; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(tcur, ERROR); - goto error1; - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -error0: - XFS_BMBT_TRACE_CURSOR(cur, ERROR); -error1: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; -} - /* * Determine the extent state. */ @@ -1453,229 +60,15 @@ xfs_extent_state( return XFS_EXT_NORM; } - -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_bmbt_split( - xfs_btree_cur_t *cur, - int level, - xfs_fsblock_t *bnop, - __uint64_t *startoff, - xfs_btree_cur_t **curp, - int *stat) /* success/failure */ -{ - xfs_alloc_arg_t args; /* block allocation args */ - int error; /* error return value */ - int i; /* loop counter */ - xfs_fsblock_t lbno; /* left sibling block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_block_t *rrblock; /* right-right btree block */ - xfs_buf_t *rrbp; /* right-right buffer pointer */ - xfs_bmbt_rec_t *rrp; /* right record pointer */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff); - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - lbp = cur->bc_bufs[level]; - lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp)); - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - args.fsbno = cur->bc_private.b.firstblock; - args.firstblock = args.fsbno; - args.minleft = 0; - if (args.fsbno == NULLFSBLOCK) { - args.fsbno = lbno; - args.type = XFS_ALLOCTYPE_START_BNO; - /* - * Make sure there is sufficient room left in the AG to - * complete a full tree split for an extent insert. If - * we are converting the middle part of an extent then - * we may need space for two tree splits. - * - * We are relying on the caller to make the correct block - * reservation for this operation to succeed. If the - * reservation amount is insufficient then we may fail a - * block allocation here and corrupt the filesystem. - */ - args.minleft = xfs_trans_get_block_res(args.tp); - } else if (cur->bc_private.b.flist->xbf_low) - args.type = XFS_ALLOCTYPE_START_BNO; - else - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.alignment = args.total = args.isfl = - args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return XFS_ERROR(ENOSPC); - } - if ((error = xfs_alloc_vextent(&args))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (args.fsbno == NULLFSBLOCK && args.minleft) { - /* - * Could not find an AG with enough free space to satisfy - * a full btree split. Try again without minleft and if - * successful activate the lowspace algorithm. - */ - args.fsbno = 0; - args.type = XFS_ALLOCTYPE_FIRST_AG; - args.minleft = 0; - if ((error = xfs_alloc_vextent(&args))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_private.b.flist->xbf_low = 1; - } - if (args.fsbno == NULLFSBLOCK) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, 1L); - rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0); - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add_cpu(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, i, cur); - lpp = XFS_BMAP_PTR_IADDR(left, i, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *startoff = be64_to_cpu(rkp->br_startoff); - } else { - lrp = XFS_BMAP_REC_IADDR(left, i, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *startoff = xfs_bmbt_disk_get_startoff(rrp); - } - be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be64(args.fsbno); - right->bb_leftsib = cpu_to_be64(lbno); - xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) { - if ((error = xfs_btree_read_bufl(args.mp, args.tp, - be64_to_cpu(right->bb_rightsib), 0, &rrbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); - if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - rrblock->bb_leftsib = cpu_to_be64(args.fsbno); - xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); - } - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = args.fsbno; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - - -/* - * Update keys for the record. - */ -STATIC int -xfs_bmbt_updkey( - xfs_btree_cur_t *cur, - xfs_bmbt_key_t *keyp, /* on-disk format */ - int level) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; -#ifdef DEBUG - int error; -#endif - xfs_bmbt_key_t *kp; - int ptr; - - ASSERT(level >= 1); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIK(cur, level, keyp); - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); - *kp = *keyp; - xfs_bmbt_log_keys(cur, bp, ptr, ptr); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; -} - /* * Convert on-disk form of btree root to in-memory form. */ void xfs_bmdr_to_bmbt( + struct xfs_mount *mp, xfs_bmdr_block_t *dblock, int dblocklen, - xfs_bmbt_block_t *rblock, + struct xfs_btree_block *rblock, int rblocklen) { int dmxr; @@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt( rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); - dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); - tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); - tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMDR_KEY_ADDR(dblock, 1); + tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_bmbt_decrement( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; /* error return value */ - xfs_fsblock_t fsbno; - int lev; - xfs_mount_t *mp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); - if (level < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - if (--cur->bc_ptrs[level] > 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - if (lev < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); - } - if (lev == cur->bc_nlevels) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - tp = cur->bc_tp; - mp = cur->bc_mp; - for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Delete the record pointed to by cur. - */ -int /* error */ -xfs_bmbt_delete( - xfs_btree_cur_t *cur, - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; - int level; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_bmbt_delrec(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_bmbt_decrement(cur, level, - &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - break; - } - } - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = i; - return 0; -} - -/* * Convert a compressed bmap extent record to an uncompressed form. * This code must be in sync with the routines xfs_bmbt_get_startoff, * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. @@ -1864,31 +147,6 @@ xfs_bmbt_get_all( } /* - * Get the block pointer for the given level of the cursor. - * Fill in the buffer pointer, if applicable. - */ -xfs_bmbt_block_t * -xfs_bmbt_get_block( - xfs_btree_cur_t *cur, - int level, - xfs_buf_t **bpp) -{ - xfs_ifork_t *ifp; - xfs_bmbt_block_t *rval; - - if (level < cur->bc_nlevels - 1) { - *bpp = cur->bc_bufs[level]; - rval = XFS_BUF_TO_BMBT_BLOCK(*bpp); - } else { - *bpp = NULL; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); - rval = ifp->if_broot; - } - return rval; -} - -/* * Extract the blockcount field from an in memory bmap extent record. */ xfs_filblks_t @@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all( xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s) { - __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s); + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); } /* @@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff( XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_bmbt_increment( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; /* error return value */ - xfs_fsblock_t fsbno; - int lev; - xfs_mount_t *mp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); - if (level < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - block = xfs_bmbt_get_block(cur, lev, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - if (lev < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - if (lev == cur->bc_nlevels) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - tp = cur->bc_tp; - mp = cur->bc_mp; - for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[lev] = 1; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Insert the current record at the point referenced by cur. - * - * A multi-level split of the tree on insert will invalidate the original - * cursor. All callers of this function should assume that the cursor is - * no longer valid and revalidate it. - */ -int /* error */ -xfs_bmbt_insert( - xfs_btree_cur_t *cur, - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; - int level; - xfs_fsblock_t nbno; - xfs_btree_cur_t *ncur; - xfs_bmbt_rec_t nrec; - xfs_btree_cur_t *pcur; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = 0; - nbno = NULLFSBLOCK; - xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); - ncur = NULL; - pcur = cur; - do { - if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - cur->bc_private.b.allocated += - pcur->bc_private.b.allocated; - pcur->bc_private.b.allocated = 0; - ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) || - XFS_IS_REALTIME_INODE(cur->bc_private.b.ip)); - cur->bc_private.b.firstblock = - pcur->bc_private.b.firstblock; - ASSERT(cur->bc_private.b.flist == - pcur->bc_private.b.flist); - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - if (ncur) { - pcur = ncur; - ncur = NULL; - } - } while (nbno != NULLFSBLOCK); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = i; - return 0; -error0: - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; -} - -/* - * Log fields from the btree block header. - */ -void -xfs_bmbt_log_block( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int fields) -{ - int first; - int last; - xfs_trans_t *tp; - static const short offsets[] = { - offsetof(xfs_bmbt_block_t, bb_magic), - offsetof(xfs_bmbt_block_t, bb_level), - offsetof(xfs_bmbt_block_t, bb_numrecs), - offsetof(xfs_bmbt_block_t, bb_leftsib), - offsetof(xfs_bmbt_block_t, bb_rightsib), - sizeof(xfs_bmbt_block_t) - }; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBI(cur, bp, fields); - tp = cur->bc_tp; - if (bp) { - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, - &last); - xfs_trans_log_buf(tp, bp, first, last); - } else - xfs_trans_log_inode(tp, cur->bc_private.b.ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -/* - * Log record values from the btree block. - */ -void -xfs_bmbt_log_recs( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int rfirst, - int rlast) -{ - xfs_bmbt_block_t *block; - int first; - int last; - xfs_bmbt_rec_t *rp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast); - ASSERT(bp); - tp = cur->bc_tp; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - rp = XFS_BMAP_REC_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -int /* error */ -xfs_bmbt_lookup_eq( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - int *stat) /* success/failure */ -{ - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; - return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat); -} - -int /* error */ -xfs_bmbt_lookup_ge( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - int *stat) /* success/failure */ -{ - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; - return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat); -} - -/* - * Give the bmap btree a new root block. Copy the old broot contents - * down into a real block and make the broot point to it. - */ -int /* error */ -xfs_bmbt_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflags, /* logging flags for inode */ - int *stat) /* return status - 0 fail */ -{ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_buf_t *bp; /* buffer for block */ - xfs_bmbt_block_t *cblock; /* child btree block */ - xfs_bmbt_key_t *ckp; /* child key pointer */ - xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ - int error; /* error return code */ -#ifdef DEBUG - int i; /* loop counter */ -#endif - xfs_bmbt_key_t *kp; /* pointer to bmap btree key */ - int level; /* btree level */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = cur->bc_nlevels - 1; - block = xfs_bmbt_get_block(cur, level, &bp); - /* - * Copy the root into a real block. - */ - args.mp = cur->bc_mp; - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - args.tp = cur->bc_tp; - args.fsbno = cur->bc_private.b.firstblock; - args.mod = args.minleft = args.alignment = args.total = args.isfl = - args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - args.firstblock = args.fsbno; - if (args.fsbno == NULLFSBLOCK) { -#ifdef DEBUG - if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - args.fsbno = be64_to_cpu(*pp); - args.type = XFS_ALLOCTYPE_START_BNO; - } else if (cur->bc_private.b.flist->xbf_low) - args.type = XFS_ALLOCTYPE_START_BNO; - else - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (args.fsbno == NULLFSBLOCK) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, 1L); - bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0); - cblock = XFS_BUF_TO_BMBT_BLOCK(bp); - *cblock = *block; - be16_add_cpu(&block->bb_level, 1); - block->bb_numrecs = cpu_to_be16(1); - cur->bc_nlevels++; - cur->bc_ptrs[level + 1] = 1; - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); - memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp)); - cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - *pp = cpu_to_be64(args.fsbno); - xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs), - cur->bc_private.b.whichfork); - xfs_btree_setbuf(cur, level, bp); - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS); - xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs)); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *logflags |= - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); - *stat = 1; - return 0; -} /* * Set all the fields in a bmap extent record from the arguments. @@ -2512,7 +429,8 @@ xfs_bmbt_set_state( */ void xfs_bmbt_to_bmdr( - xfs_bmbt_block_t *rblock, + struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, xfs_bmdr_block_t *dblock, int dblocklen) @@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr( __be64 *tpp; ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); - ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); - ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO); + ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO); ASSERT(be16_to_cpu(rblock->bb_level) > 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; - dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); - fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); - tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); + dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* - * Update the record to the passed values. - */ -int -xfs_bmbt_update( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - xfs_exntst_t state) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; - xfs_bmbt_key_t key; - int ptr; - xfs_bmbt_rec_t *rp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno, - (xfs_dfilblks_t)len, (int)state); - block = xfs_bmbt_get_block(cur, 0, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[0]; - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - xfs_bmbt_disk_set_allf(rp, off, bno, len, state); - xfs_bmbt_log_recs(cur, bp, ptr, ptr); - if (ptr > 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - key.br_startoff = cpu_to_be64(off); - if ((error = xfs_bmbt_updkey(cur, &key, 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; -} - -/* * Check extent records, which have just been read, for * any bit in the extent flag field. ASSERT on debug * kernels, as this condition should not occur. @@ -2608,3 +481,451 @@ xfs_check_nostate_extents( } return 0; } + + +STATIC struct xfs_btree_cur * +xfs_bmbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *new; + + new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.b.ip, cur->bc_private.b.whichfork); + + /* + * Copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + + return new; +} + +STATIC void +xfs_bmbt_update_cursor( + struct xfs_btree_cur *src, + struct xfs_btree_cur *dst) +{ + ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || + (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + + dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_private.b.firstblock = src->bc_private.b.firstblock; + + src->bc_private.b.allocated = 0; +} + +STATIC int +xfs_bmbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. + */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + cur->bc_private.b.flist->xbf_low = 1; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + + new->l = cpu_to_be64(args.fsbno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + + error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_bmbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + + xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, bp); + return 0; +} + +STATIC int +xfs_bmbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0) / 2; + } + + return cur->bc_mp->m_bmap_dmnr[level != 0]; +} + +int +xfs_bmbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0); + } + + return cur->bc_mp->m_bmap_dmxr[level != 0]; + +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_bmbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, + level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +#ifdef DEBUG +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* DEBUG */ + +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_bmbt_trace_buf; + +STATIC void +xfs_bmbt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) +{ + struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_private.b.whichfork; + + ktrace_enter(xfs_bmbt_trace_buf, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); + ktrace_enter(ip->i_btrace, + (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), + (void *)func, (void *)s, (void *)ip, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); +} + +STATIC void +xfs_bmbt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) +{ + struct xfs_bmbt_rec_host r; + + xfs_bmbt_set_all(&r, &cur->bc_rec.b); + + *s0 = (cur->bc_nlevels << 24) | + (cur->bc_private.b.flags << 16) | + cur->bc_private.b.allocated; + *l0 = r.l0; + *l1 = r.l1; +} + +STATIC void +xfs_bmbt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) +{ + *l0 = be64_to_cpu(key->bmbt.br_startoff); + *l1 = 0; +} + +STATIC void +xfs_bmbt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) +{ + struct xfs_bmbt_irec irec; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + *l0 = irec.br_startoff; + *l1 = irec.br_startblock; + *l2 = irec.br_blockcount; +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + +#ifdef DEBUG + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif + +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_bmbt_trace_enter, + .trace_cursor = xfs_bmbt_trace_cursor, + .trace_key = xfs_bmbt_trace_key, + .trace_record = xfs_bmbt_trace_record, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index cd0d4b4bb816..a4555abb6622 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -21,9 +21,10 @@ #define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ struct xfs_btree_cur; -struct xfs_btree_lblock; +struct xfs_btree_block; struct xfs_mount; struct xfs_inode; +struct xfs_trans; /* * Bmap root header, on-disk form only. @@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key { /* btree pointer type */ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; -/* btree block header type */ -typedef struct xfs_btree_lblock xfs_bmbt_block_t; - -#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) - -#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) -#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ - ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ - (cur)->bc_private.b.whichfork)->if_broot_bytes) - -#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ - xfs_bmdr, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) -#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ - xfs_bmbt, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) - -#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\ - xfs_bmdr, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) -#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ - xfs_bmbt, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) - -#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) - -#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) - -#define XFS_BMAP_KEY_DADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) - -#define XFS_BMAP_KEY_IADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) - -#define XFS_BMAP_PTR_DADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_PTR_IADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) +/* + * Btree block header size depends on a superblock flag. + * + * (not quite yet, but soon) + */ +#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN + +#define XFS_BMBT_REC_ADDR(mp, block, index) \ + ((xfs_bmbt_rec_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_rec_t))) + +#define XFS_BMBT_KEY_ADDR(mp, block, index) \ + ((xfs_bmbt_key_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_key_t))) + +#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_bmbt_ptr_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_bmbt_key_t) + \ + ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) + +#define XFS_BMDR_REC_ADDR(block, index) \ + ((xfs_bmdr_rec_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_rec_t))) + +#define XFS_BMDR_KEY_ADDR(block, index) \ + ((xfs_bmdr_key_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_key_t))) + +#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ + ((xfs_bmdr_ptr_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + (maxrecs) * sizeof(xfs_bmdr_key_t) + \ + ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) /* * These are to be used when we know the size of the block and * we don't have a cursor. */ -#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ - (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i)) -#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \ - (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i)) -#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \ - (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) - -#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs) -#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0) +#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ + XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) #define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(sizeof(xfs_bmbt_block_t) + \ + (int)(XFS_BTREE_LBLOCK_LEN + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) #define XFS_BMAP_BROOT_SPACE(bb) \ @@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t; */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) -#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \ - (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \ - be16_to_cpu((bb)->bb_level) == level && \ - be16_to_cpu((bb)->bb_numrecs) > 0 && \ - be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0]) - - -#ifdef __KERNEL__ - -#if defined(XFS_BMBT_TRACE) -/* - * Trace buffer entry types. - */ -#define XFS_BMBT_KTRACE_ARGBI 1 -#define XFS_BMBT_KTRACE_ARGBII 2 -#define XFS_BMBT_KTRACE_ARGFFFI 3 -#define XFS_BMBT_KTRACE_ARGI 4 -#define XFS_BMBT_KTRACE_ARGIFK 5 -#define XFS_BMBT_KTRACE_ARGIFR 6 -#define XFS_BMBT_KTRACE_ARGIK 7 -#define XFS_BMBT_KTRACE_CUR 8 - -#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */ -#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */ -extern ktrace_t *xfs_bmbt_trace_buf; -#endif - /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int); -extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *); -extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *); +extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int, + struct xfs_btree_block *, int); extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); -extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur, - int, struct xfs_buf **bpp); extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); @@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *); -extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *); -extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); -extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, - int); -extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, int *); -extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, int *); - -/* - * Give the bmap btree a new root block. Copy the old broot contents - * down into a real block and make the broot point to it. - */ -extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat); - extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); @@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); -extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); +extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, + xfs_bmdr_block_t *, int); + +extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); +extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); + +extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_inode *, int); -#endif /* __KERNEL__ */ #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index cc593a84c345..7ed59267420d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -34,7 +34,9 @@ #include "xfs_attr_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_error.h" @@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }; -/* - * Checking routine: return maxrecs for the block. - */ -STATIC int /* number of records fitting in block */ -xfs_btree_maxrecs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block) /* generic btree block pointer */ -{ - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - return (int)XFS_ALLOC_BLOCK_MAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - case XFS_BTNUM_BMAP: - return (int)XFS_BMAP_BLOCK_IMAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - case XFS_BTNUM_INO: - return (int)XFS_INOBT_BLOCK_MAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - default: - ASSERT(0); - return 0; - } -} - -/* - * External routines. - */ - -#ifdef DEBUG -/* - * Debug routine: check that block header is ok. - */ -void -xfs_btree_check_block( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block, /* generic btree block pointer */ - int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer containing block, if any */ -{ - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) - xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level, - bp); - else - xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level, - bp); -} - -/* - * Debug routine: check that keys are in the right order. - */ -void -xfs_btree_check_key( - xfs_btnum_t btnum, /* btree identifier */ - void *ak1, /* pointer to left (lower) key */ - void *ak2) /* pointer to right (higher) key */ -{ - switch (btnum) { - case XFS_BTNUM_BNO: { - xfs_alloc_key_t *k1; - xfs_alloc_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)); - break; - } - case XFS_BTNUM_CNT: { - xfs_alloc_key_t *k1; - xfs_alloc_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) || - (k1->ar_blockcount == k2->ar_blockcount && - be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock))); - break; - } - case XFS_BTNUM_BMAP: { - xfs_bmbt_key_t *k1; - xfs_bmbt_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff)); - break; - } - case XFS_BTNUM_INO: { - xfs_inobt_key_t *k1; - xfs_inobt_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino)); - break; - } - default: - ASSERT(0); - } -} -#endif /* DEBUG */ -/* - * Checking routine: check that long form block header is ok. - */ -/* ARGSUSED */ -int /* error (0 or EFSCORRUPTED) */ +STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_lblock_t *block, /* btree long form block pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer for block, if any */ + struct xfs_buf *bp) /* buffer for block, if any */ { int lblock_ok; /* block passes checks */ - xfs_mount_t *mp; /* file system mount point */ + struct xfs_mount *mp; /* file system mount point */ mp = cur->bc_mp; lblock_ok = be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= - xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && - block->bb_leftsib && - (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO || - XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) && - block->bb_rightsib && - (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO || - XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib))); - if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, XFS_RANDOM_BTREE_CHECK_LBLOCK))) { if (bp) xfs_buftrace("LBTREE ERROR", bp); @@ -189,98 +89,15 @@ xfs_btree_check_lblock( return 0; } -/* - * Checking routine: check that (long) pointer is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_dfsbno_t ptr, /* btree block disk address */ - int level) /* btree block level */ -{ - xfs_mount_t *mp; /* file system mount point */ - - mp = cur->bc_mp; - XFS_WANT_CORRUPTED_RETURN( - level > 0 && - ptr != NULLDFSBNO && - XFS_FSB_SANITY_CHECK(mp, ptr)); - return 0; -} - -#ifdef DEBUG -/* - * Debug routine: check that records are in the right order. - */ -void -xfs_btree_check_rec( - xfs_btnum_t btnum, /* btree identifier */ - void *ar1, /* pointer to left (lower) record */ - void *ar2) /* pointer to right (higher) record */ -{ - switch (btnum) { - case XFS_BTNUM_BNO: { - xfs_alloc_rec_t *r1; - xfs_alloc_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(be32_to_cpu(r1->ar_startblock) + - be32_to_cpu(r1->ar_blockcount) <= - be32_to_cpu(r2->ar_startblock)); - break; - } - case XFS_BTNUM_CNT: { - xfs_alloc_rec_t *r1; - xfs_alloc_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) || - (r1->ar_blockcount == r2->ar_blockcount && - be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock))); - break; - } - case XFS_BTNUM_BMAP: { - xfs_bmbt_rec_t *r1; - xfs_bmbt_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(xfs_bmbt_disk_get_startoff(r1) + - xfs_bmbt_disk_get_blockcount(r1) <= - xfs_bmbt_disk_get_startoff(r2)); - break; - } - case XFS_BTNUM_INO: { - xfs_inobt_rec_t *r1; - xfs_inobt_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <= - be32_to_cpu(r2->ir_startino)); - break; - } - default: - ASSERT(0); - } -} -#endif /* DEBUG */ - -/* - * Checking routine: check that block header is ok. - */ -/* ARGSUSED */ -int /* error (0 or EFSCORRUPTED) */ +STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_sblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_sblock_t *block, /* btree short form block pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer containing block */ + struct xfs_buf *bp) /* buffer containing block */ { - xfs_buf_t *agbp; /* buffer for ag. freespace struct */ - xfs_agf_t *agf; /* ag. freespace structure */ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ xfs_agblock_t agflen; /* native ag. freespace length */ int sblock_ok; /* block passes checks */ @@ -291,13 +108,13 @@ xfs_btree_check_sblock( be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && be16_to_cpu(block->bb_level) == level && be16_to_cpu(block->bb_numrecs) <= - xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && - (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK || - be32_to_cpu(block->bb_leftsib) < agflen) && - block->bb_leftsib && - (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK || - be32_to_cpu(block->bb_rightsib) < agflen) && - block->bb_rightsib; + cur->bc_ops->get_maxrecs(cur, level) && + (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK, XFS_RANDOM_BTREE_CHECK_SBLOCK))) { @@ -311,27 +128,78 @@ xfs_btree_check_sblock( } /* - * Checking routine: check that (short) pointer is ok. + * Debug routine: check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block, if any */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); + else + return xfs_btree_check_sblock(cur, block, level, bp); +} + +/* + * Check that (long) pointer is ok. */ int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLDFSBNO && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); + return 0; +} + +#ifdef DEBUG +/* + * Check that (short) pointer is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_sptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t ptr, /* btree block disk address */ - int level) /* btree block level */ + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ { - xfs_buf_t *agbp; /* buffer for ag. freespace struct */ - xfs_agf_t *agf; /* ag. freespace structure */ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - agbp = cur->bc_private.a.agbp; - agf = XFS_BUF_TO_AGF(agbp); XFS_WANT_CORRUPTED_RETURN( level > 0 && - ptr != NULLAGBLOCK && ptr != 0 && - ptr < be32_to_cpu(agf->agf_length)); + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); return 0; } /* + * Check that block ptr is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); + } +} +#endif + +/* * Delete the btree cursor. */ void @@ -387,16 +255,17 @@ xfs_btree_dup_cursor( tp = cur->bc_tp; mp = cur->bc_mp; + /* * Allocate a new cursor like the old one. */ - new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp, - cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + new = cur->bc_ops->dup_cursor(cur); + /* * Copy the record currently in the cursor. */ new->bc_rec = cur->bc_rec; + /* * For each level current, re-get the buffer and copy the ptr value. */ @@ -416,46 +285,174 @@ xfs_btree_dup_cursor( } else new->bc_bufs[i] = NULL; } - /* - * For bmap btrees, copy the firstblock, flist, and flags values, - * since init cursor doesn't get them. - */ - if (new->bc_btnum == XFS_BTNUM_BMAP) { - new->bc_private.b.firstblock = cur->bc_private.b.firstblock; - new->bc_private.b.flist = cur->bc_private.b.flist; - new->bc_private.b.flags = cur->bc_private.b.flags; - } *ncur = new; return 0; } /* + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! + */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + XFS_BTREE_LBLOCK_LEN : + XFS_BTREE_SBLOCK_LEN; +} + +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. + */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get a the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. + */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; +} + +/* * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. + * This may be an inode btree root or from a buffer. */ -STATIC xfs_btree_block_t * /* generic btree block pointer */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ - xfs_buf_t **bpp) /* buffer containing the block */ -{ - xfs_btree_block_t *block; /* return value */ - xfs_buf_t *bp; /* return buffer */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int whichfork; /* data or attr fork */ - - if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) { - whichfork = cur->bc_private.b.whichfork; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork); - block = (xfs_btree_block_t *)ifp->if_broot; - bp = NULL; - } else { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); } - ASSERT(block != NULL); - *bpp = bp; - return block; + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); } /* @@ -505,97 +502,6 @@ xfs_btree_get_bufs( } /* - * Allocate a new btree cursor. - * The cursor is either for allocation (A) or bmap (B) or inodes (I). - */ -xfs_btree_cur_t * /* new btree cursor */ -xfs_btree_init_cursor( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* (A only) buffer for agf structure */ - /* (I only) buffer for agi structure */ - xfs_agnumber_t agno, /* (AI only) allocation group number */ - xfs_btnum_t btnum, /* btree identifier */ - xfs_inode_t *ip, /* (B only) inode owning the btree */ - int whichfork) /* (B only) data or attr fork */ -{ - xfs_agf_t *agf; /* (A) allocation group freespace */ - xfs_agi_t *agi; /* (I) allocation group inodespace */ - xfs_btree_cur_t *cur; /* return value */ - xfs_ifork_t *ifp; /* (I) inode fork pointer */ - int nlevels=0; /* number of levels in the btree */ - - ASSERT(xfs_btree_cur_zone != NULL); - /* - * Allocate a new cursor. - */ - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); - /* - * Deduce the number of btree levels from the arguments. - */ - switch (btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - agf = XFS_BUF_TO_AGF(agbp); - nlevels = be32_to_cpu(agf->agf_levels[btnum]); - break; - case XFS_BTNUM_BMAP: - ifp = XFS_IFORK_PTR(ip, whichfork); - nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - break; - case XFS_BTNUM_INO: - agi = XFS_BUF_TO_AGI(agbp); - nlevels = be32_to_cpu(agi->agi_level); - break; - default: - ASSERT(0); - } - /* - * Fill in the common fields. - */ - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_nlevels = nlevels; - cur->bc_btnum = btnum; - cur->bc_blocklog = mp->m_sb.sb_blocklog; - /* - * Fill in private fields. - */ - switch (btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - /* - * Allocation btree fields. - */ - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - break; - case XFS_BTNUM_INO: - /* - * Inode allocation btree fields. - */ - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - break; - case XFS_BTNUM_BMAP: - /* - * Bmap btree fields. - */ - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.firstblock = NULLFSBLOCK; - cur->bc_private.b.flist = NULL; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; - break; - default: - ASSERT(0); - } - return cur; -} - -/* * Check for the cursor referring to the last block at the given level. */ int /* 1=is last block, 0=not last block */ @@ -603,12 +509,12 @@ xfs_btree_islastblock( xfs_btree_cur_t *cur, /* btree cursor */ int level) /* level to check */ { - xfs_btree_block_t *block; /* generic btree block pointer */ + struct xfs_btree_block *block; /* generic btree block pointer */ xfs_buf_t *bp; /* buffer containing block */ block = xfs_btree_get_block(cur, level, &bp); xfs_btree_check_block(cur, block, level, bp); - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; else return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; @@ -618,12 +524,12 @@ xfs_btree_islastblock( * Change the cursor to point to the first record at the given level. * Other levels are unaffected. */ -int /* success=1, failure=0 */ +STATIC int /* success=1, failure=0 */ xfs_btree_firstrec( xfs_btree_cur_t *cur, /* btree cursor */ int level) /* level to change */ { - xfs_btree_block_t *block; /* generic btree block pointer */ + struct xfs_btree_block *block; /* generic btree block pointer */ xfs_buf_t *bp; /* buffer containing block */ /* @@ -634,7 +540,7 @@ xfs_btree_firstrec( /* * It's empty, there is no such record. */ - if (!block->bb_h.bb_numrecs) + if (!block->bb_numrecs) return 0; /* * Set the ptr value to 1, that's the first record/key. @@ -647,12 +553,12 @@ xfs_btree_firstrec( * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ -int /* success=1, failure=0 */ +STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( xfs_btree_cur_t *cur, /* btree cursor */ int level) /* level to change */ { - xfs_btree_block_t *block; /* generic btree block pointer */ + struct xfs_btree_block *block; /* generic btree block pointer */ xfs_buf_t *bp; /* buffer containing block */ /* @@ -663,12 +569,12 @@ xfs_btree_lastrec( /* * It's empty, there is no such record. */ - if (!block->bb_h.bb_numrecs) + if (!block->bb_numrecs) return 0; /* * Set the ptr value to numrecs, that's the last record/key. */ - cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs); + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); return 1; } @@ -817,66 +723,84 @@ xfs_btree_reada_bufs( xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); } +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1); + rval++; + } + + return rval; +} + /* * Read-ahead btree blocks, at the given level. * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. */ -int -xfs_btree_readahead_core( - xfs_btree_cur_t *cur, /* btree cursor */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ int lr) /* left/right bits */ { - xfs_alloc_block_t *a; - xfs_bmbt_block_t *b; - xfs_inobt_block_t *i; - int rval = 0; + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. + */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; - ASSERT(cur->bc_bufs[lev] != NULL); cur->bc_ra[lev] |= lr; - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(a->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(a->bb_rightsib), 1); - rval++; - } - break; - case XFS_BTNUM_BMAP: - b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1); - rval++; - } - break; - case XFS_BTNUM_INO: - i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(i->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(i->bb_rightsib), 1); - rval++; - } - break; - default: - ASSERT(0); - } - return rval; + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); } /* @@ -889,7 +813,7 @@ xfs_btree_setbuf( int lev, /* level in btree */ xfs_buf_t *bp) /* new buffer to set */ { - xfs_btree_block_t *b; /* btree block */ + struct xfs_btree_block *b; /* btree block */ xfs_buf_t *obp; /* old buffer pointer */ obp = cur->bc_bufs[lev]; @@ -900,7 +824,7 @@ xfs_btree_setbuf( if (!bp) return; b = XFS_BUF_TO_BLOCK(bp); - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) @@ -912,3 +836,2855 @@ xfs_btree_setbuf( cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } } + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return be64_to_cpu(ptr->l) == NULLFSBLOCK; + else + return be32_to_cpu(ptr->s) == NULLAGBLOCK; +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLFSBLOCK); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +STATIC void +xfs_btree_init_block( + struct xfs_btree_cur *cur, + int level, + int numrecs, + struct xfs_btree_block *new) /* new block */ +{ + new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); + new->bb_level = cpu_to_be16(level); + new->bb_numrecs = cpu_to_be16(numrecs); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); + new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); + } else { + new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + } +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updateѕ to this record. The decision + * will be further refined in the update_lastrec method. + */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XFS_BUF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + ASSERT(*bpp); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int level, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XFS_BUF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp); + if (error) + return error; + + ASSERT(*bpp != NULL); + ASSERT(!XFS_BUF_GETERROR(*bpp)); + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + + error = xfs_btree_check_block(cur, *block, level, *bpp); + if (error) + xfs_trans_brelse(cur->bc_tp, *bpp); + return error; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. + */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_btree_log_recs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_rec_offset(cur, first), + xfs_btree_rec_offset(cur, last + 1) - 1); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log block pointer fields from a btree block (nonleaf). + */ +STATIC void +xfs_btree_log_ptrs( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int first, /* index of first pointer to log */ + int last) /* index of last pointer to log */ +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + int level = xfs_btree_get_level(block); + + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_ptr_offset(cur, first, level), + xfs_btree_ptr_offset(cur, last + 1, level) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log fields from a btree block header. + */ +void +xfs_btree_log_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short soffsets[] = { /* table of offsets (short) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), + XFS_BTREE_SBLOCK_LEN + }; + static const short loffsets[] = { /* table of offsets (long) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), + XFS_BTREE_LBLOCK_LEN + }; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBI(cur, bp, fields); + + if (bp) { + xfs_btree_offsets(fields, + (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + loffsets : soffsets, + XFS_BB_NUM_BITS, &first, &last); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_increment( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + int error; /* error return value */ + int lev; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the right at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* We're done if we remain in the block after the increment. */ + if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + goto out1; + + /* Fail if we just went off the right edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, increment); + + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_btree_get_block(cur, lev, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, lev, bp); + if (error) + goto error0; +#endif + + if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + break; + + /* Read-ahead the right block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + + /* + * If we went off the root then we are either seriously + * confused or have the tree root in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = 1; + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_decrement( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + xfs_buf_t *bp; + int error; /* error return value */ + int lev; + union xfs_btree_ptr ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the left at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + + /* We're done if we remain in the block after the decrement. */ + if (--cur->bc_ptrs[level] > 0) + goto out1; + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we just went off the left edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, decrement); + + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* Read-ahead the left block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + + /* + * If we went off the root then we are seriously confused. + * or the root of the tree is in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + error = xfs_btree_read_buf_block(cur, ptrp, --lev, + 0, &block, &bp); + if (error) + goto error0; + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_btree_lookup_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ +{ + struct xfs_buf *bp; /* buffer pointer for btree block */ + int error = 0; + + /* special case the root block if in an inode */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *blkp = xfs_btree_get_iroot(cur); + return 0; + } + + /* + * If the old buffer at this level for the disk address we are + * looking for re-use it. + * + * Otherwise throw it away and get a new one. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + *blkp = XFS_BUF_TO_BLOCK(bp); + return 0; + } + + error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + if (error) + return error; + + xfs_btree_setbuf(cur, level, bp); + return 0; +} + +/* + * Get current search key. For level 0 we don't actually have a key + * structure so we make one up from the record. For all other levels + * we just return the right key. + */ +STATIC union xfs_btree_key * +xfs_lookup_get_search_key( + struct xfs_btree_cur *cur, + int level, + int keyno, + struct xfs_btree_block *block, + union xfs_btree_key *kp) +{ + if (level == 0) { + cur->bc_ops->init_key_from_rec(kp, + xfs_btree_rec_addr(cur, keyno, block)); + return kp; + } + + return xfs_btree_key_addr(cur, keyno, block); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * Return 0 if can't find any such record, 1 for success. + */ +int /* error */ +xfs_btree_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno; /* current key number */ + int level; /* level in the btree */ + union xfs_btree_ptr *pp; /* ptr to btree block */ + union xfs_btree_ptr ptr; /* ptr to btree block */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, dir); + + XFS_BTREE_STATS_INC(cur, lookup); + + block = NULL; + keyno = 0; + + /* initialise start pointer from cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + pp = &ptr; + + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + /* Get the block we need to do the lookup on. */ + error = xfs_btree_lookup_get_block(cur, level, pp, &block); + if (error) + goto error0; + + if (diff == 0) { + /* + * If we already had a key match at a higher level, we + * know we need to use the first entry in this block. + */ + keyno = 1; + } else { + /* Otherwise search this block. Do a binary search. */ + + int high; /* high entry number */ + int low; /* low entry number */ + + /* Set low and high entry numbers, 1-based. */ + low = 1; + high = xfs_btree_get_numrecs(block); + if (!high) { + /* Block is empty, must be an empty leaf. */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Binary search the block. */ + while (low <= high) { + union xfs_btree_key key; + union xfs_btree_key *kp; + + XFS_BTREE_STATS_INC(cur, compare); + + /* keyno is average of low and high. */ + keyno = (low + high) >> 1; + + /* Get current search key */ + kp = xfs_lookup_get_search_key(cur, level, + keyno, block, &key); + + /* + * Compute difference to get next direction: + * - less than, move right + * - greater than, move left + * - equal, we're done + */ + diff = cur->bc_ops->key_diff(cur, kp); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = xfs_btree_ptr_addr(cur, keyno, block); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, pp, 0, level); + if (error) + goto error0; +#endif + cur->bc_ptrs[level] = keyno; + } + } + + /* Done with the search. See if we need to adjust the results. */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (dir == XFS_LOOKUP_GE && + keyno > xfs_btree_get_numrecs(block) && + !xfs_btree_ptr_is_null(cur, &ptr)) { + int i; + + cur->bc_ptrs[0] = keyno; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + + /* Return if we succeeded or not. */ + if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) + *stat = 0; + else if (dir != XFS_LOOKUP_EQ || diff == 0) + *stat = 1; + else + *stat = 0; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int +xfs_btree_updkey( + struct xfs_btree_cur *cur, + union xfs_btree_key *keyp, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_key *kp; + int ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIK(cur, level, keyp); + + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. */ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". */ + error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Accout for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. */ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. + */ +int /* error */ +xfs_btree_new_iroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + struct xfs_buf *cbp; /* buffer for cblock */ + struct xfs_btree_block *block; /* btree block */ + struct xfs_btree_block *cblock; /* child btree block */ + union xfs_btree_key *ckp; /* child key pointer */ + union xfs_btree_ptr *cpp; /* child ptr pointer */ + union xfs_btree_key *kp; /* pointer to btree key */ + union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr nptr; /* new block addr */ + int level; /* btree level */ + int error; /* error return code */ +#ifdef DEBUG + int i; /* loop counter */ +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + level = cur->bc_nlevels - 1; + + block = xfs_btree_get_iroot(cur); + pp = xfs_btree_ptr_addr(cur, 1, block); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + } + XFS_BTREE_STATS_INC(cur, alloc); + + /* Copy the root into a real block. */ + error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + if (error) + goto error0; + + memcpy(cblock, block, xfs_btree_block_len(cur)); + + be16_add_cpu(&block->bb_level, 1); + xfs_btree_set_numrecs(block, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); + + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, &nptr, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); + + xfs_iroot_realloc(cur->bc_private.b.ip, + 1 - xfs_btree_get_numrecs(cblock), + cur->bc_private.b.whichfork); + + xfs_btree_setbuf(cur, level, cbp); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + + *logflags |= + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); + *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Allocate a new root block, fill it in. + */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. */ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, + cur->bc_nlevels - 1, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, + cur->bc_nlevels - 1, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. */ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. + */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. + */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. */ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. */ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. + */ +int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. + */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. */ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = cur->bc_ops->kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, level, + 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, level, + 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. */ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, level, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. + * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 1f528a2a3754..789fffdf8b2f 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone; #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) /* - * Short form header: space allocation btrees. - */ -typedef struct xfs_btree_sblock { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ - __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ -} xfs_btree_sblock_t; - -/* - * Long form header: bmap btrees. - */ -typedef struct xfs_btree_lblock { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */ - __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */ -} xfs_btree_lblock_t; - -/* - * Combined header and structure, used by common code. + * Generic btree header. + * + * This is a comination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but + * the pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use + * the size macros below. Never use sizeof(xfs_btree_block). */ -typedef struct xfs_btree_hdr -{ +struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ -} xfs_btree_hdr_t; - -typedef struct xfs_btree_block { - xfs_btree_hdr_t bb_h; /* header */ union { struct { __be32 bb_leftsib; @@ -82,7 +62,36 @@ typedef struct xfs_btree_block { __be64 bb_rightsib; } l; /* long form pointers */ } bb_u; /* rest */ -} xfs_btree_block_t; +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. + */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; /* * For logging record fields. @@ -96,46 +105,131 @@ typedef struct xfs_btree_block { #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) /* - * Boolean to select which form of xfs_btree_block_t.bb_u to use. - */ -#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) - -/* * Magic numbers for btree blocks. */ extern const __uint32_t xfs_magics[]; /* - * Maximum and minimum records in a btree block. - * Given block size, type prefix, and leaf flag (0 or 1). - * The divisor below is equivalent to lf ? (e1) : (e2) but that produces - * compiler warnings. - */ -#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ - ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ - (((lf) * (uint)sizeof(t ## _rec_t)) + \ - ((1 - (lf)) * \ - ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) -#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ - (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) - -/* - * Record, key, and pointer address calculation macros. - * Given block size, type prefix, block pointer, and index of requested entry - * (first entry numbered 1). - */ -#define XFS_BTREE_REC_ADDR(t,bb,i) \ - ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - ((i) - 1) * sizeof(t ## _rec_t))) -#define XFS_BTREE_KEY_ADDR(t,bb,i) \ - ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - ((i) - 1) * sizeof(t ## _key_t))) -#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ - ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) + * Generic stats interface + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) #define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; + + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct xfs_btree_cur *src, + struct xfs_btree_cur *dst); + + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); + int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp, + int level, union xfs_btree_ptr *newroot); + + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int length, int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); + + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); + + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. */ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + +#ifdef DEBUG + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif + + /* btree tracing */ +#ifdef XFS_BTREE_TRACE + void (*trace_enter)(struct xfs_btree_cur *, const char *, + char *, int, int, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t, + __psunsigned_t, __psunsigned_t); + void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *, + __uint64_t *, __uint64_t *); + void (*trace_key)(struct xfs_btree_cur *, + union xfs_btree_key *, __uint64_t *, + __uint64_t *); + void (*trace_record)(struct xfs_btree_cur *, + union xfs_btree_rec *, __uint64_t *, + __uint64_t *, __uint64_t *); +#endif +}; + +/* + * Reasons for the update_lastrec method to be called. + */ +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 + + /* * Btree cursor structure. * This collects all information needed by the btree code in one place. @@ -144,6 +238,8 @@ typedef struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ union { xfs_alloc_rec_incore_t a; xfs_bmbt_irec_t b; @@ -175,94 +271,40 @@ typedef struct xfs_btree_cur } bc_private; /* per-btree type data */ } xfs_btree_cur_t; +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ + + #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 /* * Convert from buffer to btree block header. */ -#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) -#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp)) -#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) -#ifdef __KERNEL__ - -#ifdef DEBUG /* - * Debug routine: check that block header is ok. + * Check that block header is ok. */ -void +int xfs_btree_check_block( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block, /* generic btree block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp); /* buffer containing block, if any */ - -/* - * Debug routine: check that keys are in the right order. - */ -void -xfs_btree_check_key( - xfs_btnum_t btnum, /* btree identifier */ - void *ak1, /* pointer to left (lower) key */ - void *ak2); /* pointer to right (higher) key */ - -/* - * Debug routine: check that records are in the right order. - */ -void -xfs_btree_check_rec( - xfs_btnum_t btnum, /* btree identifier */ - void *ar1, /* pointer to left (lower) record */ - void *ar2); /* pointer to right (higher) record */ -#else -#define xfs_btree_check_block(a,b,c,d) -#define xfs_btree_check_key(a,b,c) -#define xfs_btree_check_rec(a,b,c) -#endif /* DEBUG */ - -/* - * Checking routine: check that long form block header is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_lblock_t *block, /* btree long form block pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ /* - * Checking routine: check that (long) pointer is ok. + * Check that (long) pointer is ok. */ int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ xfs_dfsbno_t ptr, /* btree block disk address */ int level); /* btree block level */ -#define xfs_btree_check_lptr_disk(cur, ptr, level) \ - xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level) - -/* - * Checking routine: check that short form block header is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_sblock_t *block, /* btree short form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp); /* buffer containing block */ - -/* - * Checking routine: check that (short) pointer is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t ptr, /* btree block disk address */ - int level); /* btree block level */ - /* * Delete the btree cursor. */ @@ -281,15 +323,6 @@ xfs_btree_dup_cursor( xfs_btree_cur_t **ncur);/* output cursor */ /* - * Change the cursor to point to the first record in the current block - * at the given level. Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to change */ - -/* * Get a buffer for the block, return it with no data read. * Long-form addressing. */ @@ -313,20 +346,6 @@ xfs_btree_get_bufs( uint lock); /* lock flags for get_buf */ /* - * Allocate a new btree cursor. - * The cursor is either for allocation (A) or bmap (B). - */ -xfs_btree_cur_t * /* new btree cursor */ -xfs_btree_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* (A only) buffer for agf structure */ - xfs_agnumber_t agno, /* (A only) allocation group number */ - xfs_btnum_t btnum, /* btree identifier */ - struct xfs_inode *ip, /* (B only) inode owning the btree */ - int whichfork); /* (B only) data/attr fork */ - -/* * Check for the cursor referring to the last block at the given level. */ int /* 1=is last block, 0=not last block */ @@ -335,15 +354,6 @@ xfs_btree_islastblock( int level); /* level to check */ /* - * Change the cursor to point to the last record in the current block - * at the given level. Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_lastrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to change */ - -/* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ @@ -404,39 +414,53 @@ xfs_btree_reada_bufs( xfs_extlen_t count); /* count of filesystem blocks */ /* - * Read-ahead btree blocks, at the given level. - * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. */ -int /* readahead block count */ -xfs_btree_readahead_core( +void +xfs_btree_setbuf( xfs_btree_cur_t *cur, /* btree cursor */ int lev, /* level in btree */ - int lr); /* left/right bits */ + struct xfs_buf *bp); /* new buffer to set */ -static inline int /* readahead block count */ -xfs_btree_readahead( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - int lr) /* left/right bits */ -{ - if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) - return 0; - return xfs_btree_readahead_core(cur, lev, lr); -} +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_kill_iroot(struct xfs_btree_cur *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +/* + * Internal btree helpers also used by xfs_bmap.c. + */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* - * Set the buffer for level "lev" in the cursor to bp, releasing - * any previous buffer. + * Helpers. */ -void -xfs_btree_setbuf( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - struct xfs_buf *bp); /* new buffer to set */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + block->bb_numrecs = cpu_to_be16(numrecs); +} -#endif /* __KERNEL__ */ +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_level); +} /* diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c new file mode 100644 index 000000000000..44ff942a0fda --- /dev/null +++ b/fs/xfs/xfs_btree_trace.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_inum.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_btree_trace.h" + +STATIC void +xfs_btree_trace_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr ptr, + __psunsigned_t *high, + __psunsigned_t *low) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + __u64 val = be64_to_cpu(ptr.l); + *high = val >> 32; + *low = (int)val; + } else { + *high = 0; + *low = be32_to_cpu(ptr.s); + } +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. + */ +void +xfs_btree_trace_argbi( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI, + line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for a buffer & 2 integer args. + */ +void +xfs_btree_trace_argbii( + const char *func, + struct xfs_btree_cur *cur, + struct xfs_buf *b, + int i0, + int i1, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII, + line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for 3 block-length args + * and an integer arg. + */ +void +xfs_btree_trace_argfffi( + const char *func, + struct xfs_btree_cur *cur, + xfs_dfiloff_t o, + xfs_dfsbno_t b, + xfs_dfilblks_t i, + int j, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI, + line, + o >> 32, (int)o, + b >> 32, (int)b, + i >> 32, (int)i, + (int)j, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for one integer arg. + */ +void +xfs_btree_trace_argi( + const char *func, + struct xfs_btree_cur *cur, + int i, + int line) +{ + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI, + line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, key. + */ +void +xfs_btree_trace_argipk( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_key *key, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK, + line, i, high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, fsblock, rec. + */ +void +xfs_btree_trace_argipr( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_ptr ptr, + union xfs_btree_rec *rec, + int line) +{ + __psunsigned_t high, low; + __uint64_t l0, l1, l2; + + xfs_btree_trace_ptr(cur, ptr, &high, &low); + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR, + line, i, + high, low, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for int, key. + */ +void +xfs_btree_trace_argik( + const char *func, + struct xfs_btree_cur *cur, + int i, + union xfs_btree_key *key, + int line) +{ + __uint64_t l0, l1; + + cur->bc_ops->trace_key(cur, key, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK, + line, i, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + 0, 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for arguments, for record. + */ +void +xfs_btree_trace_argr( + const char *func, + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int line) +{ + __uint64_t l0, l1, l2; + + cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2); + cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR, + line, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + l2 >> 32, (int)l2, + 0, 0, 0, 0, 0); +} + +/* + * Add a trace buffer entry for the cursor/operation. + */ +void +xfs_btree_trace_cursor( + const char *func, + struct xfs_btree_cur *cur, + int type, + int line) +{ + __uint32_t s0; + __uint64_t l0, l1; + char *s; + + switch (type) { + case XBT_ARGS: + s = "args"; + break; + case XBT_ENTRY: + s = "entry"; + break; + case XBT_ERROR: + s = "error"; + break; + case XBT_EXIT: + s = "exit"; + break; + default: + s = "unknown"; + break; + } + + cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1); + cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line, + s0, + l0 >> 32, (int)l0, + l1 >> 32, (int)l1, + (__psunsigned_t)cur->bc_bufs[0], + (__psunsigned_t)cur->bc_bufs[1], + (__psunsigned_t)cur->bc_bufs[2], + (__psunsigned_t)cur->bc_bufs[3], + (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], + (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); +} diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h new file mode 100644 index 000000000000..b3f5eb3c3c6c --- /dev/null +++ b/fs/xfs/xfs_btree_trace.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_TRACE_H__ +#define __XFS_BTREE_TRACE_H__ + +struct xfs_btree_cur; +struct xfs_buf; + + +/* + * Trace hooks. + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ + +#ifdef XFS_BTREE_TRACE + +/* + * Trace buffer entry types. + */ +#define XFS_BTREE_KTRACE_ARGBI 1 +#define XFS_BTREE_KTRACE_ARGBII 2 +#define XFS_BTREE_KTRACE_ARGFFFI 3 +#define XFS_BTREE_KTRACE_ARGI 4 +#define XFS_BTREE_KTRACE_ARGIPK 5 +#define XFS_BTREE_KTRACE_ARGIPR 6 +#define XFS_BTREE_KTRACE_ARGIK 7 +#define XFS_BTREE_KTRACE_ARGR 8 +#define XFS_BTREE_KTRACE_CUR 9 + +/* + * Sub-types for cursor traces. + */ +#define XBT_ARGS 0 +#define XBT_ENTRY 1 +#define XBT_ERROR 2 +#define XBT_EXIT 3 + +void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int); +void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *, + struct xfs_buf *, int, int, int); +void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *, + xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int); +void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int); +void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_key *, int); +void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int, + union xfs_btree_ptr, union xfs_btree_rec *, int); +void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int, + union xfs_btree_key *, int); +void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *, + union xfs_btree_rec *, int); +void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int); + + +#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */ +extern ktrace_t *xfs_allocbt_trace_buf; + +#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */ +extern ktrace_t *xfs_inobt_trace_buf; + +#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */ +#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */ +extern ktrace_t *xfs_bmbt_trace_buf; + + +#define XFS_BTREE_TRACE_ARGBI(c, b, i) \ + xfs_btree_trace_argbi(__func__, c, b, i, __LINE__) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \ + xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__) +#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \ + xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__) +#define XFS_BTREE_TRACE_ARGI(c, i) \ + xfs_btree_trace_argi(__func__, c, i, __LINE__) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \ + xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \ + xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) \ + xfs_btree_trace_argik(__func__, c, i, k, __LINE__) +#define XFS_BTREE_TRACE_ARGR(c, r) \ + xfs_btree_trace_argr(__func__, c, r, __LINE__) +#define XFS_BTREE_TRACE_CURSOR(c, t) \ + xfs_btree_trace_cursor(__func__, c, t, __LINE__) +#else +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) +#endif /* XFS_BTREE_TRACE */ + +#endif /* __XFS_BTREE_TRACE_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 002fc2617c8e..92af4098c7e8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -375,7 +375,7 @@ xfs_buf_item_unpin( xfs_buf_log_item_t *bip, int stale) { - xfs_mount_t *mp; + struct xfs_ail *ailp; xfs_buf_t *bp; int freed; @@ -387,7 +387,7 @@ xfs_buf_item_unpin( xfs_buftrace("XFS_UNPIN", bp); freed = atomic_dec_and_test(&bip->bli_refcount); - mp = bip->bli_item.li_mountp; + ailp = bip->bli_item.li_ailp; xfs_bunpin(bp); if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); @@ -399,17 +399,17 @@ xfs_buf_item_unpin( xfs_buftrace("XFS_UNPIN STALE", bp); /* * If we get called here because of an IO error, we may - * or may not have the item on the AIL. xfs_trans_delete_ail() + * or may not have the item on the AIL. xfs_trans_ail_delete() * will take care of that situation. - * xfs_trans_delete_ail() drops the AIL lock. + * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); } else { - spin_lock(&mp->m_ail_lock); - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); xfs_buf_item_relse(bp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); } @@ -707,8 +707,8 @@ xfs_buf_item_init( * the first. If we do already have one, there is * nothing to do here so return. */ - if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) - XFS_BUF_SET_FSPRIVATE3(bp, mp); + if (bp->b_mount != mp) + bp->b_mount = mp; XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); @@ -731,6 +731,7 @@ xfs_buf_item_init( bip->bli_item.li_type = XFS_LI_BUF; bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; + bip->bli_item.li_ailp = mp->m_ail; bip->bli_buf = bp; xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; @@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks( xfs_buf_do_callbacks(bp, lip); XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_CLR_IODONE_FUNC(bp); - - /* - * XFS_SHUT flag gets set when we go thru the - * entire buffer cache and deliberately start - * throwing away delayed write buffers. - * Since there's no biowait done on those, - * we should just brelse them. - */ - if (XFS_BUF_ISSHUT(bp)) { - XFS_BUF_UNSHUT(bp); - xfs_buf_relse(bp); - } else { - xfs_biodone(bp); - } - + xfs_biodone(bp); return; } @@ -1122,27 +1109,23 @@ xfs_buf_iodone( xfs_buf_t *bp, xfs_buf_log_item_t *bip) { - struct xfs_mount *mp; + struct xfs_ail *ailp = bip->bli_item.li_ailp; ASSERT(bip->bli_buf == bp); xfs_buf_rele(bp); - mp = bip->bli_item.li_mountp; /* * If we are forcibly shutting down, this may well be * off the AIL already. That's because we simulate the * log-committed callbacks to unpin these buffers. Or we may never * have put this item on AIL because of the transaction was - * aborted forcibly. xfs_trans_delete_ail() takes care of these. + * aborted forcibly. xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. */ - spin_lock(&mp->m_ail_lock); - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); xfs_buf_item_free(bip); } diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h deleted file mode 100644 index d2ce5dd70d87..000000000000 --- a/fs/xfs/xfs_clnt.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CLNT_H__ -#define __XFS_CLNT_H__ - -/* - * XFS arguments structure, constructed from the arguments we - * are passed via the mount system call. - * - * NOTE: The mount system call is handled differently between - * Linux and IRIX. In IRIX we worked work with a binary data - * structure coming in across the syscall interface from user - * space (the mount userspace knows about each filesystem type - * and the set of valid options for it, and converts the users - * argument string into a binary structure _before_ making the - * system call), and the ABI issues that this implies. - * - * In Linux, we are passed a comma separated set of options; - * ie. a NULL terminated string of characters. Userspace mount - * code does not have any knowledge of mount options expected by - * each filesystem type and so each filesystem parses its mount - * options in kernel space. - * - * For the Linux port, we kept this structure pretty much intact - * and use it internally (because the existing code groks it). - */ -struct xfs_mount_args { - int flags; /* flags -> see XFSMNT_... macros below */ - int flags2; /* flags -> see XFSMNT2_... macros below */ - int logbufs; /* Number of log buffers, -1 to default */ - int logbufsize; /* Size of log buffers, -1 to default */ - char fsname[MAXNAMELEN+1]; /* data device name */ - char rtname[MAXNAMELEN+1]; /* realtime device filename */ - char logname[MAXNAMELEN+1]; /* journal device filename */ - char mtpt[MAXNAMELEN+1]; /* filesystem mount point */ - int sunit; /* stripe unit (BBs) */ - int swidth; /* stripe width (BBs), multiple of sunit */ - uchar_t iosizelog; /* log2 of the preferred I/O size */ - int ihashsize; /* inode hash table size (buckets) */ -}; - -/* - * XFS mount option flags -- args->flags1 - */ -#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */ -#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount - * compatible */ -#define XFSMNT_INO64 0x00000004 /* move inode numbers up - * past 2^32 */ -#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */ -#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */ -#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit - * enforcement */ -#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit - * enforcement */ -#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */ -#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at - * stripe boundaries*/ -#define XFSMNT_RETERR 0x00000400 /* return error to user */ -#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies - * read-only mount */ -#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */ -#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ -#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ - /* (osyncisdsync is default) */ -#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */ -#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 - * bits of address space */ -#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ -#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit - * enforcement */ -#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ -#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ -#define XFSMNT_BARRIER 0x04000000 /* use write barriers */ -#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */ -#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width - * allocation */ -#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename - * symlink,mkdir,rmdir,mknod */ -#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */ - -/* - * XFS mount option flags -- args->flags2 - */ -#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred - * I/O size in stat(2) */ -#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams - * allocator */ - -#endif /* __XFS_CLNT_H__ */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 8be0b00ede9a..70b710c1792d 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -72,27 +72,7 @@ typedef struct xfs_da_intnode { typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; typedef struct xfs_da_node_entry xfs_da_node_entry_t; -#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */ - #define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize -#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog - -#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \ - (((bno) << (mp)->m_dircook_elog) | (entry)) -#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \ - (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash)) -#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie) -#define XFS_DA_COOKIE_BNO(mp,cookie) \ - ((((xfs_off_t)(cookie) >> 31) == -1LL ? \ - (xfs_dablk_t)0 : \ - (xfs_dablk_t)((xfs_off_t)(cookie) >> \ - ((mp)->m_dircook_elog + 32)))) -#define XFS_DA_COOKIE_ENTRY(mp,cookie) \ - ((((xfs_off_t)(cookie) >> 31) == -1LL ? \ - (xfs_dablk_t)0 : \ - (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \ - ((1 << (mp)->m_dircook_elog) - 1)))) - /*======================================================================== * Btree searching and modification structure definitions. @@ -226,9 +206,8 @@ struct xfs_nameops { }; -#ifdef __KERNEL__ /*======================================================================== - * Function prototypes for the kernel. + * Function prototypes. *========================================================================*/ /* @@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); extern struct kmem_zone *xfs_da_state_zone; extern struct kmem_zone *xfs_dabuf_zone; -#endif /* __KERNEL__ */ #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 75b0cd4da0ea..b4c1ee713492 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -49,9 +49,8 @@ */ int xfs_swapext( - xfs_swapext_t __user *sxu) + xfs_swapext_t *sxp) { - xfs_swapext_t *sxp; xfs_inode_t *ip, *tip; struct file *file, *target_file; int error = 0; @@ -62,11 +61,6 @@ xfs_swapext( goto out; } - if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) { - error = XFS_ERROR(EFAULT); - goto out_free_sxp; - } - /* Pull information for the target fd */ file = fget((int)sxp->sx_fdtarget); if (!file) { diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h index da178205be68..4f55a6306558 100644 --- a/fs/xfs/xfs_dfrag.h +++ b/fs/xfs/xfs_dfrag.h @@ -46,7 +46,7 @@ typedef struct xfs_swapext /* * Syscall interface for xfs_swapext */ -int xfs_swapext(struct xfs_swapext __user *sx); +int xfs_swapext(struct xfs_swapext *sx); int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, struct xfs_swapext *sxp); diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index c9065eaf2a4d..162e8726df5e 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -18,32 +18,29 @@ #ifndef __XFS_DINODE_H__ #define __XFS_DINODE_H__ -struct xfs_buf; -struct xfs_mount; +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2)) -#define XFS_DINODE_VERSION_1 1 -#define XFS_DINODE_VERSION_2 2 -#define XFS_DINODE_GOOD_VERSION(v) \ - (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2)) -#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ - -/* - * Disk inode structure. - * This is just the header; the inode is expanded to fill a variable size - * with the last field expanding. It is split into the core and "other" - * because we only need the core part in the in-core inode. - */ typedef struct xfs_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ } xfs_timestamp_t; /* - * Note: Coordinate changes to this structure with the XFS_DI_* #defines - * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode - * in xfs_inode.h. + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. */ -typedef struct xfs_dinode_core { +typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ @@ -69,34 +66,12 @@ typedef struct xfs_dinode_core { __be16 di_dmstate; /* DMIG state info */ __be16 di_flags; /* random flags, XFS_DIFLAG_... */ __be32 di_gen; /* generation number */ -} xfs_dinode_core_t; -#define DI_MAX_FLUSH 0xffff + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ +} __attribute__((packed)) xfs_dinode_t; -typedef struct xfs_dinode -{ - xfs_dinode_core_t di_core; - /* - * In adding anything between the core and the union, be - * sure to update the macros like XFS_LITINO below and - * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h. - */ - __be32 di_next_unlinked;/* agi unlinked list ptr */ - union { - xfs_bmdr_block_t di_bmbt; /* btree root block */ - xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */ - xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */ - char di_c[1]; /* local contents */ - __be32 di_dev; /* device for S_IFCHR/S_IFBLK */ - uuid_t di_muuid; /* mount point value */ - char di_symlink[1]; /* local symbolic link */ - } di_u; - union { - xfs_bmdr_block_t di_abmbt; /* btree root block */ - xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */ - xfs_attr_shortform_t di_attrsf; /* shortform attribute list */ - } di_a; -} xfs_dinode_t; +#define DI_MAX_FLUSH 0xffff /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. @@ -107,50 +82,14 @@ typedef struct xfs_dinode #define XFS_MAXLINK_1 65535U /* - * Bit names for logging disk inodes only - */ -#define XFS_DI_MAGIC 0x0000001 -#define XFS_DI_MODE 0x0000002 -#define XFS_DI_VERSION 0x0000004 -#define XFS_DI_FORMAT 0x0000008 -#define XFS_DI_ONLINK 0x0000010 -#define XFS_DI_UID 0x0000020 -#define XFS_DI_GID 0x0000040 -#define XFS_DI_NLINK 0x0000080 -#define XFS_DI_PROJID 0x0000100 -#define XFS_DI_PAD 0x0000200 -#define XFS_DI_ATIME 0x0000400 -#define XFS_DI_MTIME 0x0000800 -#define XFS_DI_CTIME 0x0001000 -#define XFS_DI_SIZE 0x0002000 -#define XFS_DI_NBLOCKS 0x0004000 -#define XFS_DI_EXTSIZE 0x0008000 -#define XFS_DI_NEXTENTS 0x0010000 -#define XFS_DI_NAEXTENTS 0x0020000 -#define XFS_DI_FORKOFF 0x0040000 -#define XFS_DI_AFORMAT 0x0080000 -#define XFS_DI_DMEVMASK 0x0100000 -#define XFS_DI_DMSTATE 0x0200000 -#define XFS_DI_FLAGS 0x0400000 -#define XFS_DI_GEN 0x0800000 -#define XFS_DI_NEXT_UNLINKED 0x1000000 -#define XFS_DI_U 0x2000000 -#define XFS_DI_A 0x4000000 -#define XFS_DI_NUM_BITS 27 -#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1) -#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A)) - -/* * Values for di_format */ -typedef enum xfs_dinode_fmt -{ - XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ - XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ - /* LNK: di_symlink */ - XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ - XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */ - XFS_DINODE_FMT_UUID /* MNT: di_uuid */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ } xfs_dinode_fmt_t; /* @@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt */ #define XFS_LITINO(mp) ((mp)->m_litino) #define XFS_BROOT_SIZE_ADJ \ - (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) + (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t)) /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) -#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ (XFS_DFORK_Q(dip) ? \ @@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt XFS_DFORK_DSIZE(dip, mp) : \ XFS_DFORK_ASIZE(dip, mp)) -#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)(dip) + sizeof(struct xfs_dinode)) #define XFS_DFORK_APTR(dip) \ - ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + #define XFS_DFORK_FORMAT(dip,w) \ ((w) == XFS_DATA_FORK ? \ - (dip)->di_core.di_format : \ - (dip)->di_core.di_aformat) + (dip)->di_format : \ + (dip)->di_aformat) #define XFS_DFORK_NEXTENTS(dip,w) \ ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_core.di_nextents) : \ - be16_to_cpu((dip)->di_core.di_anextents)) + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) /* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* * Values for di_flags * There should be a one-to-one correspondence between these flags and the * XFS_XFLAG_s. diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h index deecc9d238f8..6ac44b550d39 100644 --- a/fs/xfs/xfs_dir2_sf.h +++ b/fs/xfs/xfs_dir2_sf.h @@ -34,13 +34,6 @@ struct xfs_mount; struct xfs_trans; /* - * Maximum size of a shortform directory. - */ -#define XFS_DIR2_SF_MAX_SIZE \ - (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \ - (uint)sizeof(xfs_agino_t)) - -/* * Inode number stored as 8 8-bit values. */ typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c index a1e55fb9d5dd..e71e2581c0c3 100644 --- a/fs/xfs/xfs_dmops.c +++ b/fs/xfs/xfs_dmops.c @@ -25,7 +25,6 @@ #include "xfs_inum.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_clnt.h" static struct xfs_dmops xfs_dmcore_stub = { @@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = { }; int -xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) +xfs_dmops_get(struct xfs_mount *mp) { - if (args->flags & XFSMNT_DMAPI) { + if (mp->m_flags & XFS_MOUNT_DMAPI) { cmn_err(CE_WARN, "XFS: dmapi support not available in this kernel."); return EINVAL; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index f227ecd1a294..92d5cd5bf4f2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) } #endif /* DEBUG */ -static void -xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) -{ - if (mp != NULL) { - char *newfmt; - int len = 16 + mp->m_fsname_len + strlen(fmt); - - newfmt = kmem_alloc(len, KM_SLEEP); - sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); - icmn_err(level, newfmt, ap); - kmem_free(newfmt); - } else { - icmn_err(level, fmt, ap); - } -} void xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 11543f10b0c6..0c93051c4651 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 struct xfs_mount; -/* PRINTFLIKE4 */ + +extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp, + char *fmt, va_list ap) + __attribute__ ((format (printf, 3, 0))); extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...); -/* PRINTFLIKE3 */ -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); + char *fmt, ...) + __attribute__ ((format (printf, 4, 5))); +extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 8aa28f751b2a..05a4bdd4be39 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) STATIC void xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) { - xfs_mount_t *mp; + struct xfs_ail *ailp = efip->efi_item.li_ailp; - mp = efip->efi_item.li_mountp; - spin_lock(&mp->m_ail_lock); + spin_lock(&ailp->xa_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } } @@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) STATIC void xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) { - xfs_mount_t *mp; + struct xfs_ail *ailp = efip->efi_item.li_ailp; xfs_log_item_desc_t *lidp; - mp = efip->efi_item.li_mountp; - spin_lock(&mp->m_ail_lock); + spin_lock(&ailp->xa_lock); if (efip->efi_flags & XFS_EFI_CANCELED) { /* * free the xaction descriptor pointing to this item */ lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); xfs_trans_free_item(tp, lidp); - /* - * pull the item off the AIL. - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); + + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } } @@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp, efip->efi_item.li_type = XFS_LI_EFI; efip->efi_item.li_ops = &xfs_efi_item_ops; efip->efi_item.li_mountp = mp; + efip->efi_item.li_ailp = mp->m_ail; efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; @@ -345,25 +340,22 @@ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - xfs_mount_t *mp; - int extents_left; + struct xfs_ail *ailp = efip->efi_item.li_ailp; + int extents_left; - mp = efip->efi_item.li_mountp; ASSERT(efip->efi_next_extent > 0); ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - spin_lock(&mp->m_ail_lock); + spin_lock(&ailp->xa_lock); ASSERT(efip->efi_next_extent >= nextents); efip->efi_next_extent -= nextents; extents_left = efip->efi_next_extent; if (extents_left == 0) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip); xfs_efi_item_free(efip); } else { - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } } @@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp, efdp->efd_item.li_type = XFS_LI_EFD; efdp->efd_item.li_ops = &xfs_efd_item_ops; efdp->efd_item.li_mountp = mp; + efdp->efd_item.li_ailp = mp->m_ail; efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 01c0cc88d3f3..f7c06fac8229 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -113,22 +113,14 @@ struct getbmapx { #define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ -#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) -#ifdef __KERNEL__ -#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */ -#endif +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) /* bmv_oflags values - returned for for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ - -/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ -#define GETBMAP_CONVERT(p1,p2) { \ - p2.bmv_offset = p1.bmv_offset; \ - p2.bmv_block = p1.bmv_block; \ - p2.bmv_length = p1.bmv_length; \ - p2.bmv_count = p1.bmv_count; \ - p2.bmv_entries = p1.bmv_entries; } - +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ /* * Structure for XFS_IOC_FSSETDM. @@ -426,10 +418,6 @@ typedef struct xfs_handle { #define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS #define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS #define XFS_IOC_GETVERSION FS_IOC_GETVERSION -/* 32-bit compat counterparts */ -#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS -#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS -#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION /* * ioctl commands that replace IRIX fcntl()'s @@ -477,8 +465,8 @@ typedef struct xfs_handle { #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 84583cf73db3..680d0e0ec932 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -126,7 +126,7 @@ xfs_growfs_data_private( xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - xfs_btree_sblock_t *block; + struct xfs_btree_block *block; xfs_buf_t *bp; int bucket; int dpct; @@ -251,14 +251,14 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); block->bb_level = 0; block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -272,14 +272,14 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); block->bb_level = 0; block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + arec = XFS_ALLOC_REC_ADDR(mp, block, 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); @@ -294,13 +294,13 @@ xfs_growfs_data_private( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); + block = XFS_BUF_TO_BLOCK(bp); memset(block, 0, mp->m_sb.sb_blocksize); block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); block->bb_level = 0; block->bb_numrecs = 0; - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(mp, bp); if (error) { goto error0; @@ -435,6 +435,9 @@ xfs_growfs_data( xfs_growfs_data_t *in) { int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_data_private(mp, in); @@ -448,6 +451,9 @@ xfs_growfs_log( xfs_growfs_log_t *in) { int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_log_private(mp, in); @@ -589,17 +595,19 @@ out: return 0; } -void +int xfs_fs_log_dummy( xfs_mount_t *mp) { xfs_trans_t *tp; xfs_inode_t *ip; + int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } ip = mp->m_rootip; @@ -609,9 +617,10 @@ xfs_fs_log_dummy( xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61ad..88435e0a77c9 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern void xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index aad8c5da38af..e6ebbaeb4dc6 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -41,68 +41,6 @@ #include "xfs_error.h" #include "xfs_bmap.h" -/* - * Log specified fields for the inode given by bp and off. - */ -STATIC void -xfs_ialloc_log_di( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* inode buffer */ - int off, /* index of inode in buffer */ - int fields) /* bitmask of fields to log */ -{ - int first; /* first byte number */ - int ioffset; /* off in bytes */ - int last; /* last byte number */ - xfs_mount_t *mp; /* mount point structure */ - static const short offsets[] = { /* field offsets */ - /* keep in sync with bits */ - offsetof(xfs_dinode_core_t, di_magic), - offsetof(xfs_dinode_core_t, di_mode), - offsetof(xfs_dinode_core_t, di_version), - offsetof(xfs_dinode_core_t, di_format), - offsetof(xfs_dinode_core_t, di_onlink), - offsetof(xfs_dinode_core_t, di_uid), - offsetof(xfs_dinode_core_t, di_gid), - offsetof(xfs_dinode_core_t, di_nlink), - offsetof(xfs_dinode_core_t, di_projid), - offsetof(xfs_dinode_core_t, di_pad), - offsetof(xfs_dinode_core_t, di_atime), - offsetof(xfs_dinode_core_t, di_mtime), - offsetof(xfs_dinode_core_t, di_ctime), - offsetof(xfs_dinode_core_t, di_size), - offsetof(xfs_dinode_core_t, di_nblocks), - offsetof(xfs_dinode_core_t, di_extsize), - offsetof(xfs_dinode_core_t, di_nextents), - offsetof(xfs_dinode_core_t, di_anextents), - offsetof(xfs_dinode_core_t, di_forkoff), - offsetof(xfs_dinode_core_t, di_aformat), - offsetof(xfs_dinode_core_t, di_dmevmask), - offsetof(xfs_dinode_core_t, di_dmstate), - offsetof(xfs_dinode_core_t, di_flags), - offsetof(xfs_dinode_core_t, di_gen), - offsetof(xfs_dinode_t, di_next_unlinked), - offsetof(xfs_dinode_t, di_u), - offsetof(xfs_dinode_t, di_a), - sizeof(xfs_dinode_t) - }; - - - ASSERT(offsetof(xfs_dinode_t, di_core) == 0); - ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0); - mp = tp->t_mountp; - /* - * Get the inode-relative first and last bytes for these fields - */ - xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last); - /* - * Convert to buffer offsets and log it. - */ - ioffset = off << mp->m_sb.sb_inodelog; - first += ioffset; - last += ioffset; - xfs_trans_log_buf(tp, bp, first, last); -} /* * Allocation group level functions. @@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment( } /* + * Lookup the record equal to ino in the btree given by cur. + */ +STATIC int /* error */ +xfs_inobt_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to ino + * in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free, /* free inode mask */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = fcnt; + cur->bc_rec.i.ir_free = free; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [ino, fcnt, free]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + __int32_t fcnt, /* free inode count */ + xfs_inofree_t free) /* free inode mask */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(ino); + rec.inobt.ir_freecount = cpu_to_be32(fcnt); + rec.inobt.ir_free = cpu_to_be64(free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t *ino, /* output: starting inode of chunk */ + __int32_t *fcnt, /* output: number of free inodes */ + xfs_inofree_t *free, /* output: free inode mask */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *ino = be32_to_cpu(rec->inobt.ir_startino); + *fcnt = be32_to_cpu(rec->inobt.ir_freecount); + *free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc( * able to use the file system. */ if (xfs_sb_version_hasnlink(&args.mp->m_sb)) - version = XFS_DINODE_VERSION_2; + version = 2; else - version = XFS_DINODE_VERSION_1; + version = 1; /* * Seed the new inode cluster with a random generation number. This @@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc( XFS_BUF_LOCK); ASSERT(fbuf); ASSERT(!XFS_BUF_GETERROR(fbuf)); + /* - * Set initial values for the inodes in this buffer. + * Initialize all inodes in this buffer and then log them. + * + * XXX: It would be much better if we had just one transaction to + * log a whole cluster of inodes instead of all the indivdual + * transactions causing a lot of log traffic. */ xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); for (i = 0; i < ninodes; i++) { + int ioffset = i << args.mp->m_sb.sb_inodelog; + uint isize = sizeof(struct xfs_dinode); + free = XFS_MAKE_IPTR(args.mp, fbuf, i); - free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - free->di_core.di_version = version; - free->di_core.di_gen = cpu_to_be32(gen); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_ialloc_log_di(tp, fbuf, i, - XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); + xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); } xfs_trans_inode_alloc_buf(tp, fbuf); } @@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc( /* * Insert records describing the new inode chunk into the btree. */ - cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); for (thisino = newino; thisino < newino + newlen; thisino += XFS_INODES_PER_CHUNK) { @@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc( return error; } ASSERT(i == 0); - if ((error = xfs_inobt_insert(cur, &i))) { + if ((error = xfs_btree_insert(cur, &i))) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } @@ -676,8 +716,7 @@ nextag: */ agno = tagno; *IO_agbp = NULL; - cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -697,7 +736,7 @@ nextag: goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error0; } while (i == 1); @@ -741,7 +780,7 @@ nextag: /* * Search left with tcur, back up 1 record. */ - if ((error = xfs_inobt_decrement(tcur, 0, &i))) + if ((error = xfs_btree_decrement(tcur, 0, &i))) goto error1; doneleft = !i; if (!doneleft) { @@ -755,7 +794,7 @@ nextag: /* * Search right with cur, go forward 1 record. */ - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error1; doneright = !i; if (!doneright) { @@ -817,7 +856,7 @@ nextag: * further left. */ if (useleft) { - if ((error = xfs_inobt_decrement(tcur, 0, + if ((error = xfs_btree_decrement(tcur, 0, &i))) goto error1; doneleft = !i; @@ -837,7 +876,7 @@ nextag: * further right. */ else { - if ((error = xfs_inobt_increment(cur, 0, + if ((error = xfs_btree_increment(cur, 0, &i))) goto error1; doneright = !i; @@ -892,7 +931,7 @@ nextag: XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (rec.ir_freecount > 0) break; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); } @@ -926,7 +965,7 @@ nextag: goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error0; } while (i == 1); ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || @@ -1022,8 +1061,7 @@ xfs_difree( /* * Initialize the cursor. */ - cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); #ifdef DEBUG if (cur->bc_nlevels == 1) { int freecount = 0; @@ -1036,7 +1074,7 @@ xfs_difree( goto error0; if (i) { freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error0; } } while (i == 1); @@ -1098,8 +1136,8 @@ xfs_difree( xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); - if ((error = xfs_inobt_delete(cur, &i))) { - cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", + if ((error = xfs_btree_delete(cur, &i))) { + cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", error, mp->m_fsname); goto error0; } @@ -1141,7 +1179,7 @@ xfs_difree( goto error0; if (i) { freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto error0; } } while (i == 1); @@ -1158,36 +1196,28 @@ error0: } /* - * Return the location of the inode in bno/off, for mapping it into a buffer. + * Return the location of the inode in imap, for mapping it into a buffer. */ -/*ARGSUSED*/ int -xfs_dilocate( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ +xfs_imap( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ - xfs_fsblock_t *bno, /* output: block containing inode */ - int *len, /* output: num blocks in inode cluster */ - int *off, /* output: index in block of inode */ - uint flags) /* flags concerning inode lookup */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ { xfs_agblock_t agbno; /* block number of inode in the alloc group */ - xfs_buf_t *agbp; /* agi buffer */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agnumber_t agno; /* allocation group number */ int blks_per_cluster; /* num blocks per inode cluster */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ - xfs_agino_t chunk_agino; /* first agino in inode chunk */ - __int32_t chunk_cnt; /* count of free inodes in chunk */ - xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ - xfs_btree_cur_t *cur; /* inode btree cursor */ int error; /* error code */ - int i; /* temp state */ int offset; /* index of inode in its buffer */ int offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); + /* * Split up the inode number into its parts. */ @@ -1198,24 +1228,24 @@ xfs_dilocate( ino != XFS_AGINO_TO_INO(mp, agno, agino)) { #ifdef DEBUG /* no diagnostics for bulkstat, ino comes from userspace */ - if (flags & XFS_IMAP_BULKSTAT) + if (flags & XFS_IGET_BULKSTAT) return XFS_ERROR(EINVAL); if (agno >= mp->m_sb.sb_agcount) { xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: agno (%d) >= " + "xfs_imap: agno (%d) >= " "mp->m_sb.sb_agcount (%d)", agno, mp->m_sb.sb_agcount); } if (agbno >= mp->m_sb.sb_agblocks) { xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: agbno (0x%llx) >= " + "xfs_imap: agbno (0x%llx) >= " "mp->m_sb.sb_agblocks (0x%lx)", (unsigned long long) agbno, (unsigned long) mp->m_sb.sb_agblocks); } if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: ino (0x%llx) != " + "xfs_imap: ino (0x%llx) != " "XFS_AGINO_TO_INO(mp, agno, agino) " "(0x%llx)", ino, XFS_AGINO_TO_INO(mp, agno, agino)); @@ -1224,65 +1254,89 @@ xfs_dilocate( #endif /* DEBUG */ return XFS_ERROR(EINVAL); } - if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || - !(flags & XFS_IMAP_LOOKUP)) { + + /* + * If the inode cluster size is the same as the blocksize or + * smaller we get to the buffer by simple arithmetics. + */ + if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - *bno = XFS_AGB_TO_FSB(mp, agno, agbno); - *off = offset; - *len = 1; + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_len = XFS_FSB_TO_BB(mp, 1); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); return 0; } + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; - if (*bno != NULLFSBLOCK) { + + /* + * If we get a block number passed from bulkstat we can use it to + * find the buffer easily. + */ + if (imap->im_blkno) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); - *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + - offset; - *len = blks_per_cluster; + + cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); + offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; + + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); return 0; } + + /* + * If the inode chunks are aligned then use simple maths to + * find the location. Otherwise we have to do a btree + * lookup to find the location. + */ if (mp->m_inoalign_mask) { offset_agbno = agbno & mp->m_inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { + xfs_btree_cur_t *cur; /* inode btree cursor */ + xfs_agino_t chunk_agino; /* first agino in inode chunk */ + __int32_t chunk_cnt; /* count of free inodes in chunk */ + xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ + xfs_buf_t *agbp; /* agi buffer */ + int i; /* temp state */ + down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); up_read(&mp->m_peraglock); if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_ialloc_read_agi() returned " "error %d, agno %d", error, agno); -#endif /* DEBUG */ return error; } - cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); - if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_inobt_lookup_le() failed"); -#endif /* DEBUG */ goto error0; } - if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, - &chunk_free, &i))) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + + error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, + &chunk_free, &i); + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_inobt_get_rec() failed"); -#endif /* DEBUG */ goto error0; } if (i == 0) { #ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_inobt_get_rec() failed"); #endif /* DEBUG */ error = XFS_ERROR(EINVAL); } + error0: xfs_trans_brelse(tp, agbp); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); if (error) @@ -1290,19 +1344,35 @@ xfs_dilocate( chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); offset_agbno = agbno - chunk_agbno; } + ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / blks_per_cluster) * blks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); - *off = offset; - *len = blks_per_cluster; + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " + "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " + " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", + (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return XFS_ERROR(EINVAL); + } + return 0; -error0: - xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; } /* @@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi( xfs_trans_log_buf(tp, bp, first, last); } +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) +#endif + /* * Read in the allocation group header (inode allocation section) */ int -xfs_ialloc_read_agi( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_buf_t **bpp) /* allocation group hdr buf */ +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ { - xfs_agi_t *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ - xfs_buf_t *bp; /* allocation group hdr buf */ - xfs_perag_t *pag; /* per allocation group data */ - int error; + struct xfs_agi *agi; /* allocation group header */ + int agi_ok; /* agi is consistent */ + int error; ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, bpp); if (error) return error; - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + + ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + agi = XFS_BUF_TO_AGI(*bpp); /* * Validate the magic number of the agi block. */ - agi = XFS_BUF_TO_AGI(bp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); + agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && + be32_to_cpu(agi->agi_seqno) == agno; if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, + XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, mp, agi); - xfs_trans_brelse(tp, bp); + xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } + + XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF); + + xfs_check_agi_unlinked(agi); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); pag = &mp->m_perag[agno]; + if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_init = 1; - } else { - /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. - */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); } -#ifdef DEBUG - { - int i; - - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) - ASSERT(agi->agi_unlinked[i]); - } -#endif - - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF); - *bpp = bp; + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); return 0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 4e30ec1d13bc..50f558a4e0a8 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -20,6 +20,7 @@ struct xfs_buf; struct xfs_dinode; +struct xfs_imap; struct xfs_mount; struct xfs_trans; @@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) } -#ifdef __KERNEL__ /* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether @@ -105,17 +105,14 @@ xfs_difree( xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* - * Return the location of the inode in bno/len/off, - * for mapping it into a buffer. + * Return the location of the inode in imap, for mapping it into a buffer. */ int -xfs_dilocate( +xfs_imap( struct xfs_mount *mp, /* file system mount structure */ struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ - xfs_fsblock_t *bno, /* output: block containing inode */ - int *len, /* output: num blocks in cluster*/ - int *off, /* output: index in block of inode */ + struct xfs_imap *imap, /* location map structure */ uint flags); /* flags for inode btree lookup */ /* @@ -154,6 +151,24 @@ xfs_ialloc_pagi_init( struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno); /* allocation group number */ -#endif /* __KERNEL__ */ +/* + * Lookup the first record greater than or equal to ino + * in the btree given by cur. + */ +int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, + __int32_t fcnt, xfs_inofree_t free, int *stat); + +/* + * Lookup the first record less than or equal to ino + * in the btree given by cur. + */ +int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, + __int32_t fcnt, xfs_inofree_t free, int *stat); + +/* + * Get the data from the pointed-to record. + */ +extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, + __int32_t *fcnt, xfs_inofree_t *free, int *stat); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 83502f3edef0..99f2408e8d8e 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -35,2044 +35,349 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" -STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int); -STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *); -STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *, - xfs_inobt_key_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int); -/* - * Single level of the xfs_inobt_delete record deletion routine. - * Delete record pointed to by cur/level. - * Remove the record from its block then rebalance the tree. - * Return 0 for error, 1 for done, 2 to go on to the next level. - */ -STATIC int /* error */ -xfs_inobt_delrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level removing record from */ - int *stat) /* fail/done/go-on */ +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_buf_t *agbp; /* buffer for a.g. inode header */ - xfs_mount_t *mp; /* mount structure */ - xfs_agi_t *agi; /* allocation group inode header */ - xfs_inobt_block_t *block; /* btree block record/key lives in */ - xfs_agblock_t bno; /* btree block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* kp points here if block is level 0 */ - xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */ - xfs_agblock_t lbno; /* left block's block number */ - xfs_buf_t *lbp; /* left block's buffer pointer */ - xfs_inobt_block_t *left; /* left btree block */ - xfs_inobt_key_t *lkp; /* left block key pointer */ - xfs_inobt_ptr_t *lpp; /* left block address pointer */ - int lrecs = 0; /* number of records in left block */ - xfs_inobt_rec_t *lrp; /* left block record pointer */ - xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_agblock_t rbno; /* right block's block number */ - xfs_buf_t *rbp; /* right block's buffer pointer */ - xfs_inobt_block_t *right; /* right btree block */ - xfs_inobt_key_t *rkp; /* right block key pointer */ - xfs_inobt_rec_t *rp; /* pointer to btree records */ - xfs_inobt_ptr_t *rpp; /* right block address pointer */ - int rrecs = 0; /* number of records in right block */ - int numrecs; - xfs_inobt_rec_t *rrp; /* right block record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ - - mp = cur->bc_mp; - - /* - * Get the index of the entry being deleted, check for nothing there. - */ - ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - *stat = 0; - return 0; - } - - /* - * Get the buffer & block containing the record or key/ptr. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Fail if we're off the end of the block. - */ + return cur->bc_mp->m_inobt_mnr[level != 0]; +} - numrecs = be16_to_cpu(block->bb_numrecs); - if (ptr > numrecs) { - *stat = 0; - return 0; - } - /* - * It's a nonleaf. Excise the key and ptr being deleted, by - * sliding the entries past them down one. - * Log the changed areas of the block. - */ - if (level > 0) { - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level))) - return error; - } -#endif - if (ptr < numrecs) { - memmove(&kp[ptr - 1], &kp[ptr], - (numrecs - ptr) * sizeof(*kp)); - memmove(&pp[ptr - 1], &pp[ptr], - (numrecs - ptr) * sizeof(*kp)); - xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1); - xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1); - } - } - /* - * It's a leaf. Excise the record being deleted, by sliding the - * entries past it down one. Log the changed areas of the block. - */ - else { - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - if (ptr < numrecs) { - memmove(&rp[ptr - 1], &rp[ptr], - (numrecs - ptr) * sizeof(*rp)); - xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1); - } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - key.ir_startino = rp->ir_startino; - kp = &key; - } - } - /* - * Decrement and log the number of entries in the block. - */ - numrecs--; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * Is this the root level? If so, we're almost done. - */ - if (level == cur->bc_nlevels - 1) { - /* - * If this is the root level, - * and there's only one entry left, - * and it's NOT the leaf level, - * then we can get rid of this level. - */ - if (numrecs == 1 && level > 0) { - agbp = cur->bc_private.a.agbp; - agi = XFS_BUF_TO_AGI(agbp); - /* - * pp is still set to the first pointer in the block. - * Make it the new root of the btree. - */ - bno = be32_to_cpu(agi->agi_root); - agi->agi_root = *pp; - be32_add_cpu(&agi->agi_level, -1); - /* - * Free the block. - */ - if ((error = xfs_free_extent(cur->bc_tp, - XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1))) - return error; - xfs_trans_binval(cur->bc_tp, bp); - xfs_ialloc_log_agi(cur->bc_tp, agbp, - XFS_AGI_ROOT | XFS_AGI_LEVEL); - /* - * Update the cursor so there's one fewer level. - */ - cur->bc_bufs[level] = NULL; - cur->bc_nlevels--; - } else if (level > 0 && - (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * If we deleted the leftmost entry in the block, update the - * key values above us in the tree. - */ - if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1))) - return error; - /* - * If the number of records remaining in the block is at least - * the minimum, we're done. - */ - if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if (level > 0 && - (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * Otherwise, we have to move some records around to keep the - * tree balanced. Look at the left and right sibling blocks to - * see if we can re-balance by moving only one record. - */ - rbno = be32_to_cpu(block->bb_rightsib); - lbno = be32_to_cpu(block->bb_leftsib); - bno = NULLAGBLOCK; - ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); - /* - * Duplicate the cursor so our btree manipulations here won't - * disrupt the next level up. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - /* - * If there's a right sibling, see if it's ok to shift an entry - * out of it. - */ - if (rbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the last entry in the next block. - * Actually any entry but the first would suffice. - */ - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_inobt_increment(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * Grab a pointer to the block. - */ - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(right->bb_leftsib); - /* - * If right block is full enough so that removing one entry - * won't make it too empty, and left-shifting an entry out - * of right to us works, we're done. - */ - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_inobt_lshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_INOBT_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level > 0 && - (error = xfs_inobt_decrement(cur, level, - &i))) - return error; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference, and fix up the temp cursor to point - * to our block again (last record). - */ - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLAGBLOCK) { - xfs_btree_firstrec(tcur, level); - if ((error = xfs_inobt_decrement(tcur, level, &i))) - goto error0; - } - } - /* - * If there's a left sibling, see if it's ok to shift an entry - * out of it. - */ - if (lbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the first entry in the - * previous block. - */ - xfs_btree_firstrec(tcur, level); - if ((error = xfs_inobt_decrement(tcur, level, &i))) - goto error0; - xfs_btree_firstrec(tcur, level); - /* - * Grab a pointer to the block. - */ - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(left->bb_rightsib); - /* - * If left block is full enough so that removing one entry - * won't make it too empty, and right-shifting an entry out - * of left to us works, we're done. - */ - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_inobt_rshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_INOBT_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level == 0) - cur->bc_ptrs[0]++; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference. - */ - lrecs = be16_to_cpu(left->bb_numrecs); - } - /* - * Delete the temp cursor, we're done with it. - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - /* - * If here, we need to do a join to keep the tree balanced. - */ - ASSERT(bno != NULLAGBLOCK); - /* - * See if we can join with the left neighbor block. - */ - if (lbno != NULLAGBLOCK && - lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * Set "right" to be the starting block, - * "left" to be the left neighbor. - */ - rbno = bno; - right = block; - rrecs = be16_to_cpu(right->bb_numrecs); - rbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, lbno, 0, &lbp, - XFS_INO_BTREE_REF))) - return error; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - lrecs = be16_to_cpu(left->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - } - /* - * If that won't work, see if we can join with the right neighbor block. - */ - else if (rbno != NULLAGBLOCK && - rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * Set "left" to be the starting block, - * "right" to be the right neighbor. - */ - lbno = bno; - left = block; - lrecs = be16_to_cpu(left->bb_numrecs); - lbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, rbno, 0, &rbp, - XFS_INO_BTREE_REF))) - return error; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - rrecs = be16_to_cpu(right->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - } - /* - * Otherwise, we can't fix the imbalance. - * Just return. This is probably a logic error, but it's not fatal. - */ - else { - if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * We're now going to join "left" and "right" by moving all the stuff - * in "right" to "left" and deleting "right". - */ - if (level > 0) { - /* - * It's a non-leaf. Move keys and pointers. - */ - lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur); - lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memcpy(lkp, rkp, rrecs * sizeof(*lkp)); - memcpy(lpp, rpp, rrecs * sizeof(*lpp)); - xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); - xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); - } else { - /* - * It's a leaf. Move records. - */ - lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, rrecs * sizeof(*lrp)); - xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); - } - /* - * If we joined with the left neighbor, set the buffer in the - * cursor to the left block, and fix up the index. - */ - if (bp != lbp) { - xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += lrecs; - } - /* - * If we joined with the right neighbor and there's a level above - * us, increment the cursor at that level. - */ - else if (level + 1 < cur->bc_nlevels && - (error = xfs_alloc_increment(cur, level + 1, &i))) - return error; - /* - * Fix up the number of records in the surviving block. - */ - lrecs += rrecs; - left->bb_numrecs = cpu_to_be16(lrecs); - /* - * Fix up the right block pointer in the surviving block, and log it. - */ - left->bb_rightsib = right->bb_rightsib; - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there is a right sibling now, make it point to the - * remaining block. - */ - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - xfs_inobt_block_t *rrblock; - xfs_buf_t *rrbp; +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno); +} - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, - &rrbp, XFS_INO_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(lbno); - xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * Free the deleting block. - */ - if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, - cur->bc_private.a.agno, rbno), 1))) - return error; - xfs_trans_binval(cur->bc_tp, rbp); - /* - * Readjust the ptr at this level if it's not a leaf, since it's - * still pointing at the deletion point, which makes the cursor - * inconsistent. If this makes the ptr 0, the caller fixes it up. - * We can't use decrement because it would change the next level up. - */ - if (level > 0) - cur->bc_ptrs[level]--; - /* - * Return value means the next level up has something to do. - */ - *stat = 2; - return 0; +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); } -/* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. - */ -STATIC int /* error */ -xfs_inobt_insrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to insert record at */ - xfs_agblock_t *bnop, /* i/o: block number inserted */ - xfs_inobt_rec_t *recp, /* i/o: record data inserted */ - xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ - int *stat) /* success/failure */ +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int length, + int *stat) { - xfs_inobt_block_t *block; /* btree block record/key lives in */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* key value being inserted */ - xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */ - xfs_agblock_t nbno; /* block number of allocated block */ - xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ - xfs_inobt_key_t nkey; /* new key value, from split */ - xfs_inobt_rec_t nrec; /* new record value, for caller */ - int numrecs; - int optr; /* old ptr value */ - xfs_inobt_ptr_t *pp; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); - /* - * GCC doesn't understand the (arguably complex) control flow in - * this function and complains about uninitialized structure fields - * without this. - */ - memset(&nrec, 0, sizeof(nrec)); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* - * If we made it to the root level, allocate a new root block - * and we're done. - */ - if (level >= cur->bc_nlevels) { - error = xfs_inobt_newroot(cur, &i); - *bnop = NULLAGBLOCK; - *stat = i; + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } - /* - * Make a key out of the record data to be inserted, and save it. - */ - key.ir_startino = recp->ir_startino; - optr = ptr = cur->bc_ptrs[level]; - /* - * If we're off the left edge, return failure. - */ - if (ptr == 0) { + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } - /* - * Get pointers to the btree buffer and block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; - /* - * Check that the new entry is being inserted in the right place. - */ - if (ptr <= numrecs) { - if (level == 0) { - rp = XFS_INOBT_REC_ADDR(block, ptr, cur); - xfs_btree_check_rec(cur->bc_btnum, recp, rp); - } else { - kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); - xfs_btree_check_key(cur->bc_btnum, &key, kp); - } - } -#endif - nbno = NULLAGBLOCK; - ncur = NULL; - /* - * If the block is full, we can't insert the new entry until we - * make the block un-full. - */ - if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * First, try shifting an entry to the right neighbor. - */ - if ((error = xfs_inobt_rshift(cur, level, &i))) - return error; - if (i) { - /* nothing */ - } - /* - * Next, try shifting an entry to the left neighbor. - */ - else { - if ((error = xfs_inobt_lshift(cur, level, &i))) - return error; - if (i) { - optr = ptr = cur->bc_ptrs[level]; - } else { - /* - * Next, try splitting the current block - * in half. If this works we have to - * re-set our variables because - * we could be in a different block now. - */ - if ((error = xfs_inobt_split(cur, level, &nbno, - &nkey, &ncur, &i))) - return error; - if (i) { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, - block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - nrec.ir_startino = nkey.ir_startino; - } else { - /* - * Otherwise the insert fails. - */ - *stat = 0; - return 0; - } - } - } - } - /* - * At this point we know there's room for our new entry in the block - * we're pointing at. - */ - numrecs = be16_to_cpu(block->bb_numrecs); - if (level > 0) { - /* - * It's a non-leaf entry. Make a hole for the new data - * in the key and ptr regions of the block. - */ - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) - return error; - } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*pp)); - /* - * Now stuff the new data in, bump numrecs and log the new data. - */ -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, *bnop, level))) - return error; -#endif - kp[ptr - 1] = key; - pp[ptr - 1] = cpu_to_be32(*bnop); - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_keys(cur, bp, ptr, numrecs); - xfs_inobt_log_ptrs(cur, bp, ptr, numrecs); - } else { - /* - * It's a leaf entry. Make a hole for the new record. - */ - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*rp)); - /* - * Now stuff the new record in, bump numrecs - * and log the new data. - */ - rp[ptr - 1] = *recp; - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_recs(cur, bp, ptr, numrecs); - } - /* - * Log the new number of records in the btree header. - */ - xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); -#ifdef DEBUG - /* - * Check that the key/record is in the right place, now. - */ - if (ptr < numrecs) { - if (level == 0) - xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, - rp + ptr); - else - xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, - kp + ptr); - } -#endif - /* - * If we inserted at the start of a block, update the parents' keys. - */ - if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1))) - return error; - /* - * Return the new block number, if any. - * If there is one, give back a record value and a cursor too. - */ - *bnop = nbno; - if (nbno != NULLAGBLOCK) { - *recp = nrec; - *curp = ncur; - } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; return 0; } -/* - * Log header fields from a btree block. - */ -STATIC void -xfs_inobt_log_block( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) { - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - static const short offsets[] = { /* table of offsets */ - offsetof(xfs_inobt_block_t, bb_magic), - offsetof(xfs_inobt_block_t, bb_level), - offsetof(xfs_inobt_block_t, bb_numrecs), - offsetof(xfs_inobt_block_t, bb_leftsib), - offsetof(xfs_inobt_block_t, bb_rightsib), - sizeof(xfs_inobt_block_t) - }; + xfs_fsblock_t fsbno; + int error; - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); - xfs_trans_log_buf(tp, bp, first, last); + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; + + xfs_trans_binval(cur->bc_tp, bp); + return error; } -/* - * Log keys from a btree block (nonleaf). - */ -STATIC void -xfs_inobt_log_keys( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int kfirst, /* index of first key to log */ - int klast) /* index of last key to log */ +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - xfs_inobt_key_t *kp; /* key pointer in btree block */ - int last; /* last byte offset logged */ - - block = XFS_BUF_TO_INOBT_BLOCK(bp); - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + return cur->bc_mp->m_inobt_mxr[level != 0]; } -/* - * Log block pointer fields from a btree block (nonleaf). - */ STATIC void -xfs_inobt_log_ptrs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int pfirst, /* index of first pointer to log */ - int plast) /* index of last pointer to log */ +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */ - - block = XFS_BUF_TO_INOBT_BLOCK(bp); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + key->inobt.ir_startino = rec->inobt.ir_startino; } -/* - * Log records from a btree block (leaf). - */ STATIC void -xfs_inobt_log_recs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int rfirst, /* index of first record to log */ - int rlast) /* index of last record to log */ +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_inobt_rec_t *rp; /* record pointer for btree block */ + rec->inobt.ir_startino = key->inobt.ir_startino; +} - block = XFS_BUF_TO_INOBT_BLOCK(bp); - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } /* - * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. + * intial value of ptr for lookup */ -STATIC int /* error */ -xfs_inobt_lookup( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_lookup_t dir, /* <=, ==, or >= */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) { - xfs_agblock_t agbno; /* a.g. relative btree block number */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_inobt_block_t *block=NULL; /* current btree block */ - __int64_t diff; /* difference for the current key */ - int error; /* error return value */ - int keyno=0; /* current key number */ - int level; /* level in the btree */ - xfs_mount_t *mp; /* file system mount point */ - - /* - * Get the allocation group header, and the root block number. - */ - mp = cur->bc_mp; - { - xfs_agi_t *agi; /* a.g. inode header */ - - agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); - agno = be32_to_cpu(agi->agi_seqno); - agbno = be32_to_cpu(agi->agi_root); - } - /* - * Iterate over each level in the btree, starting at the root. - * For each level above the leaves, find the key we need, based - * on the lookup record, then follow the corresponding block - * pointer down to the next level. - */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - xfs_buf_t *bp; /* buffer pointer for btree block */ - xfs_daddr_t d; /* disk address of btree block */ - - /* - * Get the disk address we're looking for. - */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - /* - * If the old buffer at this level is for a different block, - * throw it away, otherwise just use it. - */ - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = NULL; - if (!bp) { - /* - * Need to get a new buffer. Read it, then - * set it in the cursor, releasing the old one. - */ - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) - return error; - xfs_btree_setbuf(cur, level, bp); - /* - * Point to the btree block, now that we have the buffer - */ - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, level, - bp))) - return error; - } else - block = XFS_BUF_TO_INOBT_BLOCK(bp); - /* - * If we already had a key match at a higher level, we know - * we need to use the first entry in this block. - */ - if (diff == 0) - keyno = 1; - /* - * Otherwise we need to search this block. Do a binary search. - */ - else { - int high; /* high entry number */ - xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */ - xfs_inobt_rec_t *krbase=NULL;/* base of records in block */ - int low; /* low entry number */ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); - /* - * Get a pointer to keys or records. - */ - if (level > 0) - kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur); - else - krbase = XFS_INOBT_REC_ADDR(block, 1, cur); - /* - * Set low and high entry numbers, 1-based. - */ - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - /* - * If the block is empty, the tree must - * be an empty leaf. - */ - ASSERT(level == 0 && cur->bc_nlevels == 1); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - *stat = 0; - return 0; - } - /* - * Binary search the block. - */ - while (low <= high) { - xfs_agino_t startino; /* key value */ - - /* - * keyno is average of low and high. - */ - keyno = (low + high) >> 1; - /* - * Get startino. - */ - if (level > 0) { - xfs_inobt_key_t *kkp; - - kkp = kkbase + keyno - 1; - startino = be32_to_cpu(kkp->ir_startino); - } else { - xfs_inobt_rec_t *krp; - - krp = krbase + keyno - 1; - startino = be32_to_cpu(krp->ir_startino); - } - /* - * Compute difference to get next direction. - */ - diff = (__int64_t) - startino - cur->bc_rec.i.ir_startino; - /* - * Less than, move right. - */ - if (diff < 0) - low = keyno + 1; - /* - * Greater than, move left. - */ - else if (diff > 0) - high = keyno - 1; - /* - * Equal, we're done. - */ - else - break; - } - } - /* - * If there are more levels, set up for the next level - * by getting the block number and filling in the cursor. - */ - if (level > 0) { - /* - * If we moved left, need the previous key number, - * unless there isn't one. - */ - if (diff > 0 && --keyno < 1) - keyno = 1; - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, agbno, level))) - return error; -#endif - cur->bc_ptrs[level] = keyno; - } - } - /* - * Done with the search. - * See if we need to adjust the results. - */ - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && - keyno > be16_to_cpu(block->bb_numrecs) && - be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - int i; + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); - cur->bc_ptrs[0] = keyno; - if ((error = xfs_inobt_increment(cur, 0, &i))) - return error; - ASSERT(i == 1); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - /* - * Return if we succeeded or not. - */ - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) - *stat = 0; - else - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - return 0; + ptr->s = agi->agi_root; } -/* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_inobt_lshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) { - int error; /* error return value */ -#ifdef DEBUG - int i; /* loop index */ -#endif - xfs_inobt_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left neighbor block */ - xfs_inobt_block_t *left; /* left neighbor btree block */ - xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */ - xfs_inobt_ptr_t *lpp; /* address pointer for left block */ - xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */ - int nrec; /* new number of left block entries */ - xfs_buf_t *rbp; /* buffer for right (current) block */ - xfs_inobt_block_t *right; /* right (current) btree block */ - xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */ - xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */ - xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ - - /* - * Set up variables for this block as "right". - */ - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; -#endif - /* - * If we've got no left sibling then we can't shift an entry left. - */ - if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] <= 1) { - *stat = 0; - return 0; - } - /* - * Set up the left neighbor as "left". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), - 0, &lbp, XFS_INO_BTREE_REF))) - return error; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - nrec = be16_to_cpu(left->bb_numrecs) + 1; - /* - * If non-leaf, copy a key and a ptr to the left block. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - *lkp = *rkp; - xfs_inobt_log_keys(cur, lbp, nrec, nrec); - lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) - return error; -#endif - *lpp = *rpp; - xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); - } - /* - * If leaf, copy a record to the left block. - */ - else { - lrp = XFS_INOBT_REC_ADDR(left, nrec, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - *lrp = *rrp; - xfs_inobt_log_recs(cur, lbp, nrec, nrec); - } - /* - * Bump and log left's numrecs, decrement and log right's numrecs. - */ - be16_add_cpu(&left->bb_numrecs, 1); - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); - else - xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); -#endif - be16_add_cpu(&right->bb_numrecs, -1); - xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Slide the contents of right down one entry. - */ - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]), - level))) - return error; - } -#endif - memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - } else { - memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ir_startino = rrp->ir_startino; - rkp = &key; - } - /* - * Update the parent key values of right. - */ - if ((error = xfs_inobt_updkey(cur, rkp, level + 1))) - return error; - /* - * Slide the cursor value left one. - */ - cur->bc_ptrs[level]--; - *stat = 1; - return 0; + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; } -/* - * Allocate a new root block, fill it in. - */ -STATIC int /* error */ -xfs_inobt_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC int +xfs_inobt_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) { - xfs_agi_t *agi; /* a.g. inode header */ - xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_inobt_block_t *block; /* one half of the old root block */ - xfs_buf_t *bp; /* buffer containing block */ - int error; /* error return value */ - xfs_inobt_key_t *kp; /* btree key pointer */ - xfs_agblock_t lbno; /* left block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_inobt_block_t *left; /* left btree block */ - xfs_buf_t *nbp; /* new (root) buffer */ - xfs_inobt_block_t *new; /* new (root) btree block */ - int nptr; /* new value for key index, 1 or 2 */ - xfs_inobt_ptr_t *pp; /* btree address pointer */ - xfs_agblock_t rbno; /* right block number */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_inobt_block_t *right; /* right btree block */ - xfs_inobt_rec_t *rp; /* btree record pointer */ + int error; - ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); /* - * Get a block & a buffer. + * Update the root pointer, decreasing the level by 1 and then + * free the old root. */ - agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, - be32_to_cpu(agi->agi_root)); - args.mod = args.minleft = args.alignment = args.total = args.wasdel = - args.isfl = args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) + xfs_inobt_set_root(cur, newroot, -1); + error = xfs_inobt_free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; - /* - * None available, we fail. - */ - if (args.fsbno == NULLFSBLOCK) { - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); - new = XFS_BUF_TO_INOBT_BLOCK(nbp); - /* - * Set the root data in the a.g. inode structure. - */ - agi->agi_root = cpu_to_be32(args.agbno); - be32_add_cpu(&agi->agi_level, 1); - xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp, - XFS_AGI_ROOT | XFS_AGI_LEVEL); - /* - * At the previous root level there are now two blocks: the old - * root, and the new block generated when it was split. - * We don't know which one the cursor is pointing at, so we - * set up variables "left" and "right" for each case. - */ - bp = cur->bc_bufs[cur->bc_nlevels - 1]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp))) - return error; -#endif - if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - /* - * Our block is left, pick up the right block. - */ - lbp = bp; - lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); - left = block; - rbno = be32_to_cpu(left->bb_rightsib); - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - rbno, 0, &rbp, XFS_INO_BTREE_REF))) - return error; - bp = rbp; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, - cur->bc_nlevels - 1, rbp))) - return error; - nptr = 1; - } else { - /* - * Our block is right, pick up the left block. - */ - rbp = bp; - rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp)); - right = block; - lbno = be32_to_cpu(right->bb_leftsib); - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - lbno, 0, &lbp, XFS_INO_BTREE_REF))) - return error; - bp = lbp; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, - cur->bc_nlevels - 1, lbp))) - return error; - nptr = 2; - } - /* - * Fill in the new block's btree header and log it. - */ - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - new->bb_level = cpu_to_be16(cur->bc_nlevels); - new->bb_numrecs = cpu_to_be16(2); - new->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS); - ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); - /* - * Fill in the key data in the new root. - */ - kp = XFS_INOBT_KEY_ADDR(new, 1, cur); - if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); - kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); - } else { - rp = XFS_INOBT_REC_ADDR(left, 1, cur); - kp[0].ir_startino = rp->ir_startino; - rp = XFS_INOBT_REC_ADDR(right, 1, cur); - kp[1].ir_startino = rp->ir_startino; } - xfs_inobt_log_keys(cur, nbp, 1, 2); - /* - * Fill in the pointer data in the new root. - */ - pp = XFS_INOBT_PTR_ADDR(new, 1, cur); - pp[0] = cpu_to_be32(lbno); - pp[1] = cpu_to_be32(rbno); - xfs_inobt_log_ptrs(cur, nbp, 1, 2); - /* - * Fix up the cursor. - */ - xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; - cur->bc_nlevels++; - *stat = 1; - return 0; -} -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_inobt_rshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left (current) block */ - xfs_inobt_block_t *left; /* left (current) btree block */ - xfs_inobt_key_t *lkp; /* key pointer for left block */ - xfs_inobt_ptr_t *lpp; /* address pointer for left block */ - xfs_inobt_rec_t *lrp; /* record pointer for left block */ - xfs_buf_t *rbp; /* buffer for right neighbor block */ - xfs_inobt_block_t *right; /* right neighbor btree block */ - xfs_inobt_key_t *rkp; /* key pointer for right block */ - xfs_inobt_ptr_t *rpp; /* address pointer for right block */ - xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ - xfs_btree_cur_t *tcur; /* temporary cursor */ + XFS_BTREE_STATS_INC(cur, free); - /* - * Set up variables for this block as "left". - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * If we've got no right sibling then we can't shift an entry right. - */ - if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - *stat = 0; - return 0; - } - /* - * Set up the right neighbor as "right". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), - 0, &rbp, XFS_INO_BTREE_REF))) - return error; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - /* - * Make a hole at the start of the right neighbor block, then - * copy the last left block entry to the hole. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) - return error; -#endif - *rkp = *lkp; - *rpp = *lpp; - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - } else { - lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ir_startino = rrp->ir_startino; - rkp = &key; - } - /* - * Decrement and log left's numrecs, bump and log right's numrecs. - */ - be16_add_cpu(&left->bb_numrecs, -1); - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add_cpu(&right->bb_numrecs, 1); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); - else - xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); -#endif - xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Using a temporary cursor, update the parent key values of the - * block on the right. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - xfs_btree_lastrec(tcur, level); - if ((error = xfs_inobt_increment(tcur, level, &i)) || - (error = xfs_inobt_updkey(tcur, rkp, level + 1))) { - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - *stat = 1; + cur->bc_bufs[level] = NULL; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_inobt_split( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to split */ - xfs_agblock_t *bnop, /* output: block number allocated */ - xfs_inobt_key_t *keyp, /* output: first key of new block */ - xfs_btree_cur_t **curp, /* output: new cursor */ - int *stat) /* success/failure */ -{ - xfs_alloc_arg_t args; /* allocation argument structure */ - int error; /* error return value */ - int i; /* loop index/record number */ - xfs_agblock_t lbno; /* left (current) block number */ - xfs_buf_t *lbp; /* buffer for left block */ - xfs_inobt_block_t *left; /* left (current) btree block */ - xfs_inobt_key_t *lkp; /* left btree key pointer */ - xfs_inobt_ptr_t *lpp; /* left btree address pointer */ - xfs_inobt_rec_t *lrp; /* left btree record pointer */ - xfs_buf_t *rbp; /* buffer for right block */ - xfs_inobt_block_t *right; /* right (new) btree block */ - xfs_inobt_key_t *rkp; /* right btree key pointer */ - xfs_inobt_ptr_t *rpp; /* right btree address pointer */ - xfs_inobt_rec_t *rrp; /* right btree record pointer */ - - /* - * Set up left block (current one). - */ - lbp = cur->bc_bufs[level]; - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); - /* - * Allocate the new block. - * If we can't do it, we're toast. Give up. - */ - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno); - args.mod = args.minleft = args.alignment = args.total = args.wasdel = - args.isfl = args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) - return error; - if (args.fsbno == NULLFSBLOCK) { - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); - /* - * Set up the new block as "right". - */ - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - /* - * "Left" is the current (according to the cursor) block. - */ - left = XFS_BUF_TO_INOBT_BLOCK(lbp); #ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * Fill in the btree header for the new block. - */ - right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - /* - * Make sure that if there's an odd number of entries now, that - * each new block will have the same number of entries. - */ - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add_cpu(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - /* - * For non-leaf blocks, copy keys and addresses over to the new block. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, i, cur); - lpp = XFS_INOBT_PTR_ADDR(left, i, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *keyp = *rkp; - } - /* - * For leaf blocks, copy records over to the new block. - */ - else { - lrp = XFS_INOBT_REC_ADDR(left, i, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ir_startino = rrp->ir_startino; - } - /* - * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. - */ - be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be32(args.agbno); - right->bb_leftsib = cpu_to_be32(lbno); - xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS); - xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there's a block to the new block's right, make that block - * point back to right instead of to left. - */ - if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) { - xfs_inobt_block_t *rrblock; /* rr btree block */ - xfs_buf_t *rrbp; /* buffer for rrblock */ - - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - be32_to_cpu(right->bb_rightsib), 0, &rrbp, - XFS_INO_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(args.agbno); - xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * If the cursor is really in the right block, move it there. - * If it's just pointing past the last entry in left, then we'll - * insert there, so don't change anything in that case. - */ - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - /* - * If there are more levels, we'll need another cursor which refers - * the right block, no matter where this cursor was. - */ - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) - return error; - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = args.agbno; - *stat = 1; - return 0; +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); } -/* - * Update keys at all levels from here to the root along the cursor's path. - */ -STATIC int /* error */ -xfs_inobt_updkey( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_inobt_key_t *keyp, /* new key value to update to */ - int level) /* starting level for update */ +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) { - int ptr; /* index of key in block */ - - /* - * Go up the tree from this level toward the root. - * At each level, update the key value to the value input. - * Stop when we reach a level where the cursor isn't pointing - * at the first entry in the block. - */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - xfs_buf_t *bp; /* buffer for block */ - xfs_inobt_block_t *block; /* btree block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - xfs_inobt_key_t *kp; /* ptr to btree block keys */ - - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); - *kp = *keyp; - xfs_inobt_log_keys(cur, bp, ptr, ptr); - } - return 0; + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); } +#endif /* DEBUG */ -/* - * Externally visible routines. - */ +#ifdef XFS_BTREE_TRACE +ktrace_t *xfs_inobt_trace_buf; -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_inobt_decrement( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_trace_enter( + struct xfs_btree_cur *cur, + const char *func, + char *s, + int type, + int line, + __psunsigned_t a0, + __psunsigned_t a1, + __psunsigned_t a2, + __psunsigned_t a3, + __psunsigned_t a4, + __psunsigned_t a5, + __psunsigned_t a6, + __psunsigned_t a7, + __psunsigned_t a8, + __psunsigned_t a9, + __psunsigned_t a10) { - xfs_inobt_block_t *block; /* btree block */ - int error; - int lev; /* btree level */ - - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the left at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - /* - * Decrement the ptr at this level. If we're still in the block - * then we're done. - */ - if (--cur->bc_ptrs[level] > 0) { - *stat = 1; - return 0; - } - /* - * Get a pointer to the btree block. - */ - block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, - cur->bc_bufs[level]))) - return error; -#endif - /* - * If we just went off the left edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree decrementing pointers. - * Stop when we don't go off the left edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - /* - * Read-ahead the left block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ - - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_INO_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); - } - *stat = 1; - return 0; + ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type, + (void *)func, (void *)s, NULL, (void *)cur, + (void *)a0, (void *)a1, (void *)a2, (void *)a3, + (void *)a4, (void *)a5, (void *)a6, (void *)a7, + (void *)a8, (void *)a9, (void *)a10); } -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -int /* error */ -xfs_inobt_delete( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_trace_cursor( + struct xfs_btree_cur *cur, + __uint32_t *s0, + __uint64_t *l0, + __uint64_t *l1) { - int error; - int i; /* result code */ - int level; /* btree level */ - - /* - * Go up the tree, starting at leaf level. - * If 2 is returned then a join was done; go to the next level. - * Otherwise we are done. - */ - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_inobt_delrec(cur, level, &i))) - return error; - } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_inobt_decrement(cur, level, &i))) - return error; - break; - } - } - } - *stat = i; - return 0; + *s0 = cur->bc_private.a.agno; + *l0 = cur->bc_rec.i.ir_startino; + *l1 = cur->bc_rec.i.ir_free; } - -/* - * Get the data from the pointed-to record. - */ -int /* error */ -xfs_inobt_get_rec( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t *ino, /* output: starting inode of chunk */ - __int32_t *fcnt, /* output: number of free inodes */ - xfs_inofree_t *free, /* output: free inode mask */ - int *stat) /* output: success/failure */ +STATIC void +xfs_inobt_trace_key( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + __uint64_t *l0, + __uint64_t *l1) { - xfs_inobt_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - int ptr; /* record number */ - xfs_inobt_rec_t *rec; /* record data */ - - bp = cur->bc_bufs[0]; - ptr = cur->bc_ptrs[0]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) - return error; -#endif - /* - * Off the right end or left end, return failure. - */ - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - /* - * Point to the record and extract its data. - */ - rec = XFS_INOBT_REC_ADDR(block, ptr, cur); - *ino = be32_to_cpu(rec->ir_startino); - *fcnt = be32_to_cpu(rec->ir_freecount); - *free = be64_to_cpu(rec->ir_free); - *stat = 1; - return 0; + *l0 = be32_to_cpu(key->inobt.ir_startino); + *l1 = 0; } -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_inobt_increment( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_trace_record( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + __uint64_t *l0, + __uint64_t *l1, + __uint64_t *l2) { - xfs_inobt_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ - int error; /* error return value */ - int lev; /* btree level */ + *l0 = be32_to_cpu(rec->inobt.ir_startino); + *l1 = be32_to_cpu(rec->inobt.ir_freecount); + *l2 = be64_to_cpu(rec->inobt.ir_free); +} +#endif /* XFS_BTREE_TRACE */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .kill_root = xfs_inobt_kill_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the right at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - /* - * Get a pointer to the btree block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Increment the ptr at this level. If we're still in the block - * then we're done. - */ - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - *stat = 1; - return 0; - } - /* - * If we just went off the right edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree incrementing pointers. - * Stop when we don't go off the right edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - bp = cur->bc_bufs[lev]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); #ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, #endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - /* - * Read-ahead the right block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp); - lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_INO_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = 1; - } - *stat = 1; - return 0; -} +#ifdef XFS_BTREE_TRACE + .trace_enter = xfs_inobt_trace_enter, + .trace_cursor = xfs_inobt_trace_cursor, + .trace_key = xfs_inobt_trace_key, + .trace_record = xfs_inobt_trace_record, +#endif +}; /* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. + * Allocate a new inode btree cursor. */ -int /* error */ -xfs_inobt_insert( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno) /* allocation group number */ { - int error; /* error return value */ - int i; /* result value, 0 for failure */ - int level; /* current level number in btree */ - xfs_agblock_t nbno; /* new block number (split result) */ - xfs_btree_cur_t *ncur; /* new cursor (split result) */ - xfs_inobt_rec_t nrec; /* record being inserted this level */ - xfs_btree_cur_t *pcur; /* previous level's cursor */ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; - level = 0; - nbno = NULLAGBLOCK; - nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); - nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); - ncur = NULL; - pcur = cur; - /* - * Loop going up the tree, starting at the leaf level. - * Stop when we don't get a split block, that must mean that - * the insert is finished with this level. - */ - do { - /* - * Insert nrec/nbno into this level of the tree. - * Note if we fail, nbno will be null. - */ - if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - return error; - } - /* - * See if the cursor we just used is trash. - * Can't trash the caller's cursor, but otherwise we should - * if ncur is a new cursor or we're about to be done. - */ - if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - /* - * If we got a new cursor, switch to it. - */ - if (ncur) { - pcur = ncur; - ncur = NULL; - } - } while (nbno != NULLAGBLOCK); - *stat = i; - return 0; -} + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); -/* - * Lookup the record equal to ino in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_eq( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat); -} + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_blocklog = mp->m_sb.sb_blocklog; -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_ge( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat); -} + cur->bc_ops = &xfs_inobt_ops; -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_le( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat); + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; } /* - * Update the record referred to by cur, to the value given - * by [ino, fcnt, free]. - * This either works (return 0) or gets an EFSCORRUPTED error. + * Calculate number of records in an inobt btree block. */ -int /* error */ -xfs_inobt_update( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free) /* free inode mask */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) { - xfs_inobt_block_t *block; /* btree block to update */ - xfs_buf_t *bp; /* buffer containing btree block */ - int error; /* error return value */ - int ptr; /* current record number (updating) */ - xfs_inobt_rec_t *rp; /* pointer to updated record */ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); - /* - * Pick up the current block. - */ - bp = cur->bc_bufs[0]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) - return error; -#endif - /* - * Get the address of the rec to be updated. - */ - ptr = cur->bc_ptrs[0]; - rp = XFS_INOBT_REC_ADDR(block, ptr, cur); - /* - * Fill in the new contents and log them. - */ - rp->ir_startino = cpu_to_be32(ino); - rp->ir_freecount = cpu_to_be32(fcnt); - rp->ir_free = cpu_to_be64(free); - xfs_inobt_log_recs(cur, bp, ptr, ptr); - /* - * Updating first record in leaf. Pass new key value up to our parent. - */ - if (ptr == 1) { - xfs_inobt_key_t key; /* key containing [ino] */ - - key.ir_startino = cpu_to_be32(ino); - if ((error = xfs_inobt_updkey(cur, &key, 1))) - return error; - } - return 0; + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 8efc4a5b8b92..37e5dd01a577 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -24,7 +24,6 @@ struct xfs_buf; struct xfs_btree_cur; -struct xfs_btree_sblock; struct xfs_mount; /* @@ -70,11 +69,6 @@ typedef struct xfs_inobt_key { /* btree pointer type */ typedef __be32 xfs_inobt_ptr_t; -/* btree block header type */ -typedef struct xfs_btree_sblock xfs_inobt_block_t; - -#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp)) - /* * Bit manipulations for ir_free. */ @@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; #define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) /* - * Real block structures have a size equal to the disk block size. - */ -#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0]) -#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0]) -#define XFS_INOBT_IS_LAST_REC(cur) \ - ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs)) - -/* * Maximum number of inode btree levels. */ #define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) @@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; #define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) /* - * Record, key, and pointer address macros for btree blocks. - */ -#define XFS_INOBT_REC_ADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i)) - -#define XFS_INOBT_KEY_ADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i)) - -#define XFS_INOBT_PTR_ADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat); - -/* - * Get the data from the pointed-to record. - */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, - __int32_t *fcnt, xfs_inofree_t *free, int *stat); - -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat); - -/* - * Lookup the record equal to ino in the btree given by cur. - */ -extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. - */ -extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. + * Btree block header size depends on a superblock flag. + * + * (not quite yet, but soon) */ -extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); +#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN /* - * Update the record referred to by cur, to the value given - * by [ino, fcnt, free]. - * This either works (return 0) or gets an EFSCORRUPTED error. - */ -extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free); + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e229e9e001c2..e2fb6210d4c5 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -38,281 +38,283 @@ #include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_utils.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_btree_trace.h" +#include "xfs_dir2_trace.h" + /* - * Look up an inode by number in the given file system. - * The inode is looked up in the cache held in each AG. - * If the inode is found in the cache, attach it to the provided - * vnode. - * - * If it is not in core, read it in from the file system's device, - * add it to the cache and attach the provided vnode. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - * bno -- the block number starting the buffer containing the inode, - * if known (as by bulkstat), else 0. + * Allocate and initialise an xfs_inode. */ -STATIC int -xfs_iget_core( - struct inode *inode, - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp, - xfs_daddr_t bno) +STATIC struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) { - struct inode *old_inode; - xfs_inode_t *ip; - xfs_inode_t *iq; - int error; - unsigned long first_index, mask; - xfs_perag_t *pag; - xfs_agino_t agino; + struct xfs_inode *ip; - /* the radix tree exists only in inode capable AGs */ - if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) - return EINVAL; + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; - /* get the perag structure and ensure that it's inode capable */ - pag = xfs_get_perag(mp, ino); - if (!pag->pagi_inodeok) - return EINVAL; - ASSERT(pag->pag_ici_init); - agino = XFS_INO_TO_AGINO(mp, ino); + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); -again: - read_lock(&pag->pag_ici_lock); - ip = radix_tree_lookup(&pag->pag_ici_root, agino); + /* + * initialise the VFS inode here to get failures + * out of the way early. + */ + if (!inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_update_core = 0; + ip->i_update_size = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + ip->i_size = 0; + ip->i_new_size = 0; + + /* + * Initialize inode's trace buffers. + */ +#ifdef XFS_INODE_TRACE + ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); +#endif +#ifdef XFS_BMAP_TRACE + ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); +#endif +#ifdef XFS_BTREE_TRACE + ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); +#endif +#ifdef XFS_RW_TRACE + ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); +#endif +#ifdef XFS_ILOCK_TRACE + ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); +#endif +#ifdef XFS_DIR2_TRACE + ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); +#endif + + return ip; +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + int flags, + int lock_flags) __releases(pag->pag_ici_lock) +{ + struct xfs_mount *mp = ip->i_mount; + int error = EAGAIN; + + /* + * If INEW is set this inode is being set up + * If IRECLAIM is set this inode is being torn down + * Pause and try again. + */ + if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + XFS_STATS_INC(xs_ig_frecycle); + goto out_error; + } + + /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ + if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - if (ip != NULL) { /* - * If INEW is set this inode is being set up - * we need to pause and try again. + * If lookup is racing with unlink, then we should return an + * error immediately so we don't remove it from the reclaim + * list and potentially leak the inode. */ - if (xfs_iflags_test(ip, XFS_INEW)) { - read_unlock(&pag->pag_ici_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; } - old_inode = ip->i_vnode; - if (old_inode == NULL) { - /* - * If IRECLAIM is set this inode is - * on its way out of the system, - * we need to pause and try again. - */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&pag->pag_ici_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } - ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - - /* - * If lookup is racing with unlink, then we - * should return an error immediately so we - * don't remove it from the reclaim list and - * potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && - !(flags & XFS_IGET_CREATE)) { - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); - return ENOENT; - } - - xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); - - XFS_STATS_INC(xs_ig_found); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); - read_unlock(&pag->pag_ici_lock); - - XFS_MOUNT_ILOCK(mp); - list_del_init(&ip->i_reclaim); - XFS_MOUNT_IUNLOCK(mp); - - goto finish_inode; - - } else if (inode != old_inode) { - /* The inode is being torn down, pause and - * try again. - */ - if (old_inode->i_state & (I_FREEING | I_CLEAR)) { - read_unlock(&pag->pag_ici_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } -/* Chances are the other vnode (the one in the inode) is being torn -* down right now, and we landed on top of it. Question is, what do -* we do? Unhook the old inode and hook up the new one? -*/ - cmn_err(CE_PANIC, - "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", - old_inode, inode); - } + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * Inode cache hit + * We need to re-initialise the VFS inode as it has been + * 'freed' by the VFS. Do this here so we can deal with + * errors cleanly, then tag it so it can be set up correctly + * later. */ - read_unlock(&pag->pag_ici_lock); - XFS_STATS_INC(xs_ig_found); - -finish_inode: - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - xfs_put_perag(mp, pag); - return ENOENT; + if (!inode_init_always(mp->m_super, VFS_I(ip))) { + error = ENOMEM; + goto out_error; } - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); + /* + * We must set the XFS_INEW flag before clearing the + * XFS_IRECLAIMABLE flag so that if a racing lookup does + * not find the XFS_IRECLAIMABLE above but has the igrab() + * below succeed we can safely check XFS_INEW to detect + * that this inode is still being initialised. + */ + xfs_iflags_set(ip, XFS_INEW); + xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + + /* clear the radix tree reclaim flag as well. */ + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + } else if (!igrab(VFS_I(ip))) { + /* If the VFS inode is being torn down, pause and try again. */ + XFS_STATS_INC(xs_ig_frecycle); + goto out_error; + } else if (xfs_iflags_test(ip, XFS_INEW)) { + /* + * We are racing with another cache hit that is + * currently recycling this inode out of the XFS_IRECLAIMABLE + * state. Wait for the initialisation to complete before + * continuing. + */ + wait_on_inode(VFS_I(ip)); + } - xfs_iflags_clear(ip, XFS_ISTALE); - xfs_itrace_exit_tag(ip, "xfs_iget.found"); - goto return_ip; + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + iput(VFS_I(ip)); + goto out_error; } - /* - * Inode cache miss - */ + /* We've got a live one. */ read_unlock(&pag->pag_ici_lock); - XFS_STATS_INC(xs_ig_missed); - /* - * Read the disk inode attributes into a new inode structure and get - * a new vnode for it. This should also initialize i_ino and i_mount. - */ - error = xfs_iread(mp, tp, ino, &ip, bno, - (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); - if (error) { - xfs_put_perag(mp, pag); - return error; - } + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); - xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); + xfs_iflags_clear(ip, XFS_ISTALE); + xfs_itrace_exit_tag(ip, "xfs_iget.found"); + XFS_STATS_INC(xs_ig_found); + return 0; + +out_error: + read_unlock(&pag->pag_ici_lock); + return error; +} - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - init_waitqueue_head(&ip->i_ipin_wait); - atomic_set(&ip->i_pincount, 0); +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + xfs_daddr_t bno, + int flags, + int lock_flags) __releases(pag->pag_ici_lock) +{ + struct xfs_inode *ip; + int error; + unsigned long first_index, mask; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&ip->i_flush); - complete(&ip->i_flush); + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; - if (lock_flags) - xfs_ilock(ip, lock_flags); + error = xfs_iread(mp, tp, ip, bno, flags); + if (error) + goto out_destroy; + + xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - xfs_idestroy(ip); - xfs_put_perag(mp, pag); - return ENOENT; + error = ENOENT; + goto out_destroy; } + if (lock_flags) + xfs_ilock(ip, lock_flags); + /* * Preload the radix tree so we can insert safely under the - * write spinlock. + * write spinlock. Note that we cannot sleep inside the preload + * region. */ if (radix_tree_preload(GFP_KERNEL)) { - xfs_idestroy(ip); - delay(1); - goto again; + error = EAGAIN; + goto out_unlock; } + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); first_index = agino & mask; write_lock(&pag->pag_ici_lock); - /* - * insert the new inode - */ + + /* insert the new inode */ error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { - BUG_ON(error != -EEXIST); - write_unlock(&pag->pag_ici_lock); - radix_tree_preload_end(); - xfs_idestroy(ip); + WARN_ON(error != -EEXIST); XFS_STATS_INC(xs_ig_dup); - goto again; + error = EAGAIN; + goto out_preload_end; } - /* - * These values _must_ be set before releasing the radix tree lock! - */ + /* These values _must_ be set before releasing the radix tree lock! */ ip->i_udquot = ip->i_gdquot = NULL; xfs_iflags_set(ip, XFS_INEW); write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); - - /* - * Link ip to its mount and thread it on the mount's inode list. - */ - XFS_MOUNT_ILOCK(mp); - if ((iq = mp->m_inodes)) { - ASSERT(iq->i_mprev->i_mnext == iq); - ip->i_mprev = iq->i_mprev; - iq->i_mprev->i_mnext = ip; - iq->i_mprev = ip; - ip->i_mnext = iq; - } else { - ip->i_mnext = ip; - ip->i_mprev = ip; - } - mp->m_inodes = ip; - - XFS_MOUNT_IUNLOCK(mp); - xfs_put_perag(mp, pag); - - return_ip: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); - - xfs_iflags_set(ip, XFS_IMODIFIED); *ipp = ip; - - /* - * Set up the Linux with the Linux inode. - */ - ip->i_vnode = inode; - inode->i_private = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. - */ - if (ip->i_d.di_mode != 0) - xfs_setup_inode(ip); return 0; -} +out_preload_end: + write_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); +out_unlock: + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + xfs_destroy_inode(ip); + return error; +} /* - * The 'normal' internal xfs_iget, if needed it will - * 'allocate', or 'get', the vnode. + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + * bno -- the block number starting the buffer containing the inode, + * if known (as by bulkstat), else 0. */ int xfs_iget( @@ -324,61 +326,64 @@ xfs_iget( xfs_inode_t **ipp, xfs_daddr_t bno) { - struct inode *inode; xfs_inode_t *ip; int error; + xfs_perag_t *pag; + xfs_agino_t agino; - XFS_STATS_INC(xs_ig_attempts); + /* the radix tree exists only in inode capable AGs */ + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) + return EINVAL; -retry: - inode = iget_locked(mp->m_super, ino); - if (!inode) - /* If we got no inode we are out of memory */ - return ENOMEM; + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_get_perag(mp, ino); + if (!pag->pagi_inodeok) + return EINVAL; + ASSERT(pag->pag_ici_init); + agino = XFS_INO_TO_AGINO(mp, ino); - if (inode->i_state & I_NEW) { - XFS_STATS_INC(vn_active); - XFS_STATS_INC(vn_alloc); - - error = xfs_iget_core(inode, mp, tp, ino, flags, - lock_flags, ipp, bno); - if (error) { - make_bad_inode(inode); - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - iput(inode); - } - return error; +again: + error = 0; + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + read_unlock(&pag->pag_ici_lock); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno, + flags, lock_flags); + if (error) + goto out_error_or_again; } + xfs_put_perag(mp, pag); + *ipp = ip; + + ASSERT(ip->i_df.if_ext_max == + XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); /* - * If the inode is not fully constructed due to - * filehandle mismatches wait for the inode to go - * away and try again. - * - * iget_locked will call __wait_on_freeing_inode - * to wait for the inode to go away. + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. */ - if (is_bad_inode(inode)) { - iput(inode); - delay(1); - goto retry; - } + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; - ip = XFS_I(inode); - if (!ip) { - iput(inode); +out_error_or_again: + if (error == EAGAIN) { delay(1); - goto retry; + goto again; } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - XFS_STATS_INC(xs_ig_found); - *ipp = ip; - return 0; + xfs_put_perag(mp, pag); + return error; } + /* * Look for the inode corresponding to the given ino in the hash table. * If it is there and its i_transp pointer matches tp, return it. @@ -444,99 +449,109 @@ xfs_iput_new( IRELE(ip); } - /* - * This routine embodies the part of the reclaim code that pulls - * the inode from the inode hash table and the mount structure's - * inode list. - * This should only be called from xfs_reclaim(). + * This is called free all the memory associated with an inode. + * It must free the inode itself and any buffers allocated for + * if_extents/if_data and if_broot. It must also free the lock + * associated with the inode. + * + * Note: because we don't initialise everything on reallocation out + * of the zone, we must ensure we nullify everything correctly before + * freeing the structure. */ void -xfs_ireclaim(xfs_inode_t *ip) +xfs_ireclaim( + struct xfs_inode *ip) { - /* - * Remove from old hash list and mount list. - */ - XFS_STATS_INC(xs_ig_reclaims); + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; - xfs_iextract(ip); - - /* - * Here we do a spurious inode lock in order to coordinate with - * xfs_sync(). This is because xfs_sync() references the inodes - * in the mount list without taking references on the corresponding - * vnodes. We make that OK here by ensuring that we wait until - * the inode is unlocked in xfs_sync() before we go ahead and - * free it. We get both the regular lock and the io lock because - * the xfs_sync() code may need to drop the regular one but will - * still hold the io lock. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - /* - * Release dquots (and their references) if any. An inode may escape - * xfs_inactive and get here via vn_alloc->vn_reclaim path. - */ - XFS_QM_DQDETACH(ip->i_mount, ip); - - /* - * Pull our behavior descriptor from the vnode chain. - */ - if (ip->i_vnode) { - ip->i_vnode->i_private = NULL; - ip->i_vnode = NULL; - } + XFS_STATS_INC(xs_ig_reclaims); /* - * Free all memory associated with the inode. + * Remove the inode from the per-AG radix tree. It doesn't matter + * if it was never added to it because radix_tree_delete can deal + * with that case just fine. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_idestroy(ip); -} - -/* - * This routine removes an about-to-be-destroyed inode from - * all of the lists in which it is located with the exception - * of the behavior chain. - */ -void -xfs_iextract( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); - xfs_inode_t *iq; - + pag = xfs_get_perag(mp, ip->i_ino); write_lock(&pag->pag_ici_lock); radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); write_unlock(&pag->pag_ici_lock); xfs_put_perag(mp, pag); /* - * Remove from mount's inode list. + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. We get + * both the ilock and the iolock because the code may need to drop the + * ilock one but will still hold the iolock. */ - XFS_MOUNT_ILOCK(mp); - ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL)); - iq = ip->i_mnext; - iq->i_mprev = ip->i_mprev; - ip->i_mprev->i_mnext = iq; - + xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); /* - * Fix up the head pointer if it points to the inode being deleted. + * Release dquots (and their references) if any. */ - if (mp->m_inodes == ip) { - if (ip == iq) { - mp->m_inodes = NULL; - } else { - mp->m_inodes = iq; - } + XFS_QM_DQDETACH(ip->i_mount, ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; } - /* Deal with the deleted inodes list */ - list_del_init(&ip->i_reclaim); + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); - mp->m_ireclaims++; - XFS_MOUNT_IUNLOCK(mp); +#ifdef XFS_INODE_TRACE + ktrace_free(ip->i_trace); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(ip->i_xtrace); +#endif +#ifdef XFS_BTREE_TRACE + ktrace_free(ip->i_btrace); +#endif +#ifdef XFS_RW_TRACE + ktrace_free(ip->i_rwtrace); +#endif +#ifdef XFS_ILOCK_TRACE + ktrace_free(ip->i_lock_trace); +#endif +#ifdef XFS_DIR2_TRACE + ktrace_free(ip->i_dir_trace); +#endif + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + kmem_zone_free(xfs_inode_zone, ip); } /* @@ -737,7 +752,7 @@ xfs_iunlock( * it is in the AIL and anyone is waiting on it. Don't do * this if the caller has asked us not to. */ - xfs_trans_unlocked_item(ip->i_mount, + xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp, (xfs_log_item_t*)(ip->i_itemp)); } xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); @@ -790,3 +805,51 @@ xfs_isilocked( } #endif +#ifdef XFS_INODE_TRACE + +#define KTRACE_ENTER(ip, vk, s, line, ra) \ + ktrace_enter((ip)->i_trace, \ +/* 0 */ (void *)(__psint_t)(vk), \ +/* 1 */ (void *)(s), \ +/* 2 */ (void *)(__psint_t) line, \ +/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \ +/* 4 */ (void *)(ra), \ +/* 5 */ NULL, \ +/* 6 */ (void *)(__psint_t)current_cpu(), \ +/* 7 */ (void *)(__psint_t)current_pid(), \ +/* 8 */ (void *)__return_address, \ +/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL) + +/* + * Vnode tracing code. + */ +void +_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra) +{ + KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra); +} + +void +_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra) +{ + KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra); +} + +void +xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra); +} + +void +_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra); +} + +void +xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra); +} +#endif /* XFS_INODE_TRACE */ diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h deleted file mode 100644 index d36450003983..000000000000 --- a/fs/xfs/xfs_imap.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_IMAP_H__ -#define __XFS_IMAP_H__ - -/* - * This is the structure passed to xfs_imap() to map - * an inode number to its on disk location. - */ -typedef struct xfs_imap { - xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - uint im_len; /* length in BBs of inode chunk */ - xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */ - ushort im_ioffset; /* inode offset in block in "inodes" */ - ushort im_boffset; /* inode offset in block in bytes */ -} xfs_imap_t; - -#ifdef __KERNEL__ -struct xfs_mount; -struct xfs_trans; -int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_imap_t *, uint); -#endif - -#endif /* __XFS_IMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a391b955df01..5a5e035e5d38 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -23,7 +23,6 @@ #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_imap.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_sb.h" @@ -41,6 +40,7 @@ #include "xfs_buf_item.h" #include "xfs_inode_item.h" #include "xfs_btree.h" +#include "xfs_btree_trace.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" @@ -133,10 +133,10 @@ STATIC int xfs_imap_to_bp( xfs_mount_t *mp, xfs_trans_t *tp, - xfs_imap_t *imap, + struct xfs_imap *imap, xfs_buf_t **bpp, uint buf_flags, - uint imap_flags) + uint iget_flags) { int error; int i; @@ -173,12 +173,12 @@ xfs_imap_to_bp( dip = (xfs_dinode_t *)xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); - di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); + di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && + XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { - if (imap_flags & XFS_IMAP_BULKSTAT) { + if (iget_flags & XFS_IGET_BULKSTAT) { xfs_trans_brelse(tp, bp); return XFS_ERROR(EINVAL); } @@ -190,7 +190,7 @@ xfs_imap_to_bp( "daddr %lld #%d (magic=%x)", XFS_BUFTARG_NAME(mp->m_ddev_targp), (unsigned long long)imap->im_blkno, i, - be16_to_cpu(dip->di_core.di_magic)); + be16_to_cpu(dip->di_magic)); #endif xfs_trans_brelse(tp, bp); return XFS_ERROR(EFSCORRUPTED); @@ -221,25 +221,26 @@ xfs_imap_to_bp( * Use xfs_imap() to determine the size and location of the * buffer to read from disk. */ -STATIC int +int xfs_inotobp( xfs_mount_t *mp, xfs_trans_t *tp, xfs_ino_t ino, xfs_dinode_t **dipp, xfs_buf_t **bpp, - int *offset) + int *offset, + uint imap_flags) { - xfs_imap_t imap; + struct xfs_imap imap; xfs_buf_t *bp; int error; imap.im_blkno = 0; - error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); + error = xfs_imap(mp, tp, ino, &imap, imap_flags); if (error) return error; - error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); + error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); if (error) return error; @@ -260,15 +261,11 @@ xfs_inotobp( * If a non-zero error is returned, then the contents of bpp and * dipp are undefined. * - * If the inode is new and has not yet been initialized, use xfs_imap() - * to determine the size and location of the buffer to read from disk. - * If the inode has already been mapped to its buffer and read in once, - * then use the mapping information stored in the inode rather than - * calling xfs_imap(). This allows us to avoid the overhead of looking - * at the inode btree for small block file systems (see xfs_dilocate()). - * We can tell whether the inode has been mapped in before by comparing - * its disk block address to 0. Only uninitialized inodes will have - * 0 for the disk block address. + * The inode is expected to already been mapped to its buffer and read + * in once, thus we can use the mapping information stored in the inode + * rather than calling xfs_imap(). This allows us to avoid the overhead + * of looking at the inode btree for small block file systems + * (see xfs_imap()). */ int xfs_itobp( @@ -277,40 +274,14 @@ xfs_itobp( xfs_inode_t *ip, xfs_dinode_t **dipp, xfs_buf_t **bpp, - xfs_daddr_t bno, - uint imap_flags, uint buf_flags) { - xfs_imap_t imap; xfs_buf_t *bp; int error; - if (ip->i_blkno == (xfs_daddr_t)0) { - imap.im_blkno = bno; - error = xfs_imap(mp, tp, ip->i_ino, &imap, - XFS_IMAP_LOOKUP | imap_flags); - if (error) - return error; + ASSERT(ip->i_imap.im_blkno != 0); - /* - * Fill in the fields in the inode that will be used to - * map the inode to its buffer from now on. - */ - ip->i_blkno = imap.im_blkno; - ip->i_len = imap.im_len; - ip->i_boffset = imap.im_boffset; - } else { - /* - * We've already mapped the inode once, so just use the - * mapping that we saved the first time. - */ - imap.im_blkno = ip->i_blkno; - imap.im_len = ip->i_len; - imap.im_boffset = ip->i_boffset; - } - ASSERT(bno == 0 || bno == imap.im_blkno); - - error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); if (error) return error; @@ -321,7 +292,7 @@ xfs_itobp( return EAGAIN; } - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); *bpp = bp; return 0; } @@ -348,26 +319,26 @@ xfs_iformat( XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); error = 0; - if (unlikely(be32_to_cpu(dip->di_core.di_nextents) + - be16_to_cpu(dip->di_core.di_anextents) > - be64_to_cpu(dip->di_core.di_nblocks))) { + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", (unsigned long long)ip->i_ino, - (int)(be32_to_cpu(dip->di_core.di_nextents) + - be16_to_cpu(dip->di_core.di_anextents)), + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), (unsigned long long) - be64_to_cpu(dip->di_core.di_nblocks)); + be64_to_cpu(dip->di_nblocks)); XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); } - if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", (unsigned long long)ip->i_ino, - dip->di_core.di_forkoff); + dip->di_forkoff); XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); @@ -378,25 +349,25 @@ xfs_iformat( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) { + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); return XFS_ERROR(EFSCORRUPTED); } ip->i_d.di_size = 0; ip->i_size = 0; - ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev); + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); break; case S_IFREG: case S_IFLNK: case S_IFDIR: - switch (dip->di_core.di_format) { + switch (dip->di_format) { case XFS_DINODE_FMT_LOCAL: /* * no local regular files yet */ - if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) { + if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt inode %Lu " "(local format for regular file).", @@ -407,7 +378,7 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } - di_size = be64_to_cpu(dip->di_core.di_size); + di_size = be64_to_cpu(dip->di_size); if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, "corrupt inode %Lu " @@ -449,7 +420,7 @@ xfs_iformat( ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - switch (dip->di_core.di_aformat) { + switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); @@ -621,7 +592,7 @@ xfs_iformat_btree( ifp = XFS_IFORK_PTR(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(dfp); - nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); /* * blow out if -- fork has less extents than can fit in @@ -649,8 +620,9 @@ xfs_iformat_btree( * Copy and convert from the on-disk structure * to the in-memory structure. */ - xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); + xfs_bmdr_to_bmbt(ip->i_mount, dfp, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); ifp->if_flags &= ~XFS_IFEXTENTS; ifp->if_flags |= XFS_IFBROOT; @@ -660,7 +632,7 @@ xfs_iformat_btree( void xfs_dinode_from_disk( xfs_icdinode_t *to, - xfs_dinode_core_t *from) + xfs_dinode_t *from) { to->di_magic = be16_to_cpu(from->di_magic); to->di_mode = be16_to_cpu(from->di_mode); @@ -694,7 +666,7 @@ xfs_dinode_from_disk( void xfs_dinode_to_disk( - xfs_dinode_core_t *to, + xfs_dinode_t *to, xfs_icdinode_t *from) { to->di_magic = cpu_to_be16(from->di_magic); @@ -781,93 +753,57 @@ uint xfs_dic2xflags( xfs_dinode_t *dip) { - xfs_dinode_core_t *dic = &dip->di_core; - - return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) | + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } /* - * Given a mount structure and an inode number, return a pointer - * to a newly allocated in-core inode corresponding to the given - * inode number. - * - * Initialize the inode's attributes and extent pointers if it - * already has them (it will not if the inode has no links). + * Read the disk inode attributes into the in-core inode structure. */ int xfs_iread( xfs_mount_t *mp, xfs_trans_t *tp, - xfs_ino_t ino, - xfs_inode_t **ipp, + xfs_inode_t *ip, xfs_daddr_t bno, - uint imap_flags) + uint iget_flags) { xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_inode_t *ip; int error; - ASSERT(xfs_inode_zone != NULL); - - ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); - ip->i_ino = ino; - ip->i_mount = mp; - atomic_set(&ip->i_iocount, 0); - spin_lock_init(&ip->i_flags_lock); - /* - * Get pointer's to the on-disk inode and the buffer containing it. - * If the inode number refers to a block outside the file system - * then xfs_itobp() will return NULL. In this case we should - * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will - * know that this is a new incore inode. + * Fill in the location information in the in-core inode. */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); - if (error) { - kmem_zone_free(xfs_inode_zone, ip); + ip->i_imap.im_blkno = bno; + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) return error; - } + ASSERT(bno == 0 || bno == ip->i_imap.im_blkno); /* - * Initialize inode's trace buffers. - * Do this before xfs_iformat in case it adds entries. + * Get pointers to the on-disk inode and the buffer containing it. */ -#ifdef XFS_INODE_TRACE - ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_BMAP_TRACE - ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_BMBT_TRACE - ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_RW_TRACE - ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_ILOCK_TRACE - ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); -#endif -#ifdef XFS_DIR2_TRACE - ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); -#endif + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, + XFS_BUF_LOCK, iget_flags); + if (error) + return error; + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); /* * If we got something that isn't an inode it means someone * (nfs or dmi) has a stale handle. */ - if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) { - kmem_zone_free(xfs_inode_zone, ip); - xfs_trans_brelse(tp, bp); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { #ifdef DEBUG xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " - "dip->di_core.di_magic (0x%x) != " + "dip->di_magic (0x%x) != " "XFS_DINODE_MAGIC (0x%x)", - be16_to_cpu(dip->di_core.di_magic), + be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); #endif /* DEBUG */ - return XFS_ERROR(EINVAL); + error = XFS_ERROR(EINVAL); + goto out_brelse; } /* @@ -877,24 +813,22 @@ xfs_iread( * specific information. * Otherwise, just get the truly permanent information. */ - if (dip->di_core.di_mode) { - xfs_dinode_from_disk(&ip->i_d, &dip->di_core); + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); error = xfs_iformat(ip, dip); if (error) { - kmem_zone_free(xfs_inode_zone, ip); - xfs_trans_brelse(tp, bp); #ifdef DEBUG xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " "xfs_iformat() returned error %d", error); #endif /* DEBUG */ - return error; + goto out_brelse; } } else { - ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); - ip->i_d.di_version = dip->di_core.di_version; - ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); /* * Make sure to pull in the mode here as well in * case the inode is released without being used. @@ -911,8 +845,6 @@ xfs_iread( XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); } - INIT_LIST_HEAD(&ip->i_reclaim); - /* * The inode format changed when we moved the link count and * made it 32 bits long. If this is an old format inode, @@ -924,7 +856,7 @@ xfs_iread( * the new format. We don't change the version number so that we * can distinguish this from a real new format inode. */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + if (ip->i_d.di_version == 1) { ip->i_d.di_nlink = ip->i_d.di_onlink; ip->i_d.di_onlink = 0; ip->i_d.di_projid = 0; @@ -938,7 +870,7 @@ xfs_iread( * around for a while. This helps to keep recently accessed * meta-data in-core longer. */ - XFS_BUF_SET_REF(bp, XFS_INO_REF); + XFS_BUF_SET_REF(bp, XFS_INO_REF); /* * Use xfs_trans_brelse() to release the buffer containing the @@ -953,9 +885,9 @@ xfs_iread( * to worry about the inode being changed just because we released * the buffer. */ + out_brelse: xfs_trans_brelse(tp, bp); - *ipp = ip; - return 0; + return error; } /* @@ -1049,6 +981,7 @@ xfs_ialloc( uint flags; int error; timespec_t tv; + int filestreams = 0; /* * Call the space management code to pick @@ -1056,9 +989,8 @@ xfs_ialloc( */ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, ialloc_context, call_again, &ino); - if (error != 0) { + if (error) return error; - } if (*call_again || ino == NULLFSINO) { *ipp = NULL; return 0; @@ -1072,9 +1004,8 @@ xfs_ialloc( */ error = xfs_trans_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); - if (error != 0) { + if (error) return error; - } ASSERT(ip != NULL); ip->i_d.di_mode = (__uint16_t)mode; @@ -1093,8 +1024,8 @@ xfs_ialloc( * here rather than here and in the flush/logging code. */ if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && - ip->i_d.di_version == XFS_DINODE_VERSION_1) { - ip->i_d.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; /* * We've already zeroed the old link count, the projid field, * and the pad field. @@ -1104,7 +1035,7 @@ xfs_ialloc( /* * Project ids won't be stored on disk if we are using a version 1 inode. */ - if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) + if ((prid != 0) && (ip->i_d.di_version == 1)) xfs_bump_ino_vers2(tp, ip); if (pip && XFS_INHERIT_GID(pip)) { @@ -1155,13 +1086,12 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: - if (pip && xfs_inode_is_filestream(pip)) { - error = xfs_filestream_associate(pip, ip); - if (error < 0) - return -error; - if (!error) - xfs_iflags_set(ip, XFS_IFILESTREAM); - } + /* + * we can't set up filestreams until after the VFS inode + * is set up properly. + */ + if (pip && xfs_inode_is_filestream(pip)) + filestreams = 1; /* fall through */ case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { @@ -1227,6 +1157,15 @@ xfs_ialloc( /* now that we have an i_mode we can setup inode ops and unlock */ xfs_setup_inode(ip); + /* now we have set up the vfs inode we can associate the filestream */ + if (filestreams) { + error = xfs_filestream_associate(pip, ip); + if (error < 0) + return -error; + if (!error) + xfs_iflags_set(ip, XFS_IFILESTREAM); + } + *ipp = ip; return 0; } @@ -1383,8 +1322,8 @@ xfs_itrunc_trace( * direct I/O with the truncate operation. Also, because we hold * the IOLOCK in exclusive mode, we prevent new direct I/Os from being * started until the truncate completes and drops the lock. Essentially, - * the vn_iowait() call forms an I/O barrier that provides strict ordering - * between direct I/Os and the truncate operation. + * the xfs_ioend_wait() call forms an I/O barrier that provides strict + * ordering between direct I/Os and the truncate operation. * * The flags parameter can have either the value XFS_ITRUNC_DEFINITE * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used @@ -1415,7 +1354,7 @@ xfs_itruncate_start( /* wait for the completion of any pending DIOs */ if (new_size == 0 || new_size < ip->i_size) - vn_iowait(ip); + xfs_ioend_wait(ip); /* * Call toss_pages or flushinval_pages to get rid of pages @@ -1726,8 +1665,14 @@ xfs_itruncate_finish( xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ihold(ntp, ip); - if (!error) - error = xfs_trans_reserve(ntp, 0, + if (error) + return error; + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT); @@ -1781,13 +1726,10 @@ xfs_iunlink( xfs_dinode_t *dip; xfs_buf_t *agibp; xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_daddr_t agdaddr; xfs_agino_t agino; short bucket_index; int offset; int error; - int agi_ok; ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_mode != 0); @@ -1795,31 +1737,15 @@ xfs_iunlink( mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); - /* * Get the agi buffer first. It ensures lock ordering * on the list. */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, - XFS_FSS_TO_BB(mp, 1), 0, &agibp); + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); if (error) return error; - - /* - * Validate the magic number of the agi block. - */ agi = XFS_BUF_TO_AGI(agibp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, - XFS_RANDOM_IUNLINK))) { - XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); - xfs_trans_brelse(tp, agibp); - return XFS_ERROR(EFSCORRUPTED); - } + /* * Get the index into the agi hash table for the * list this inode will go on. @@ -1837,14 +1763,14 @@ xfs_iunlink( * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); if (error) return error; ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); /* both on-disk, don't endian flip twice */ dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_boffset + + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, @@ -1879,7 +1805,6 @@ xfs_iunlink_remove( xfs_buf_t *agibp; xfs_buf_t *ibp; xfs_agnumber_t agno; - xfs_daddr_t agdaddr; xfs_agino_t agino; xfs_agino_t next_agino; xfs_buf_t *last_ibp; @@ -1887,45 +1812,20 @@ xfs_iunlink_remove( short bucket_index; int offset, last_offset = 0; int error; - int agi_ok; - /* - * First pull the on-disk inode from the AGI unlinked list. - */ mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); /* * Get the agi buffer first. It ensures lock ordering * on the list. */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, - XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { - cmn_err(CE_WARN, - "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) return error; - } - /* - * Validate the magic number of the agi block. - */ + agi = XFS_BUF_TO_AGI(agibp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, - XFS_RANDOM_IUNLINK_REMOVE))) { - XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, agibp); - cmn_err(CE_WARN, - "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", - mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } + /* * Get the index into the agi hash table for the * list this inode will go on. @@ -1945,7 +1845,7 @@ xfs_iunlink_remove( * of dealing with the buffer when there is no need to * change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -1956,7 +1856,7 @@ xfs_iunlink_remove( ASSERT(next_agino != 0); if (next_agino != NULLAGINO) { dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_boffset + + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, @@ -1992,7 +1892,7 @@ xfs_iunlink_remove( } next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); error = xfs_inotobp(mp, tp, next_ino, &last_dip, - &last_ibp, &last_offset); + &last_ibp, &last_offset, 0); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", @@ -2007,7 +1907,7 @@ xfs_iunlink_remove( * Now last_ibp points to the buffer previous to us on * the unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK); if (error) { cmn_err(CE_WARN, "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", @@ -2019,7 +1919,7 @@ xfs_iunlink_remove( ASSERT(next_agino != agino); if (next_agino != NULLAGINO) { dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_boffset + + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, @@ -2160,9 +2060,9 @@ xfs_ifree_cluster( iip = (xfs_inode_log_item_t *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; - spin_lock(&mp->m_ail_lock); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); pre_flushed++; } @@ -2183,9 +2083,8 @@ xfs_ifree_cluster( iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; iip->ili_logged = 1; - spin_lock(&mp->m_ail_lock); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) @@ -2263,7 +2162,7 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); + error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK); if (error) return error; @@ -2279,7 +2178,7 @@ xfs_ifree( * This is a temporary hack that would require a proper fix * in the future. */ - dip->di_core.di_mode = 0; + dip->di_mode = 0; if (delete) { xfs_ifree_cluster(ip, tp, first_ino); @@ -2312,9 +2211,10 @@ xfs_iroot_realloc( int rec_diff, int whichfork) { + struct xfs_mount *mp = ip->i_mount; int cur_max; xfs_ifork_t *ifp; - xfs_bmbt_block_t *new_broot; + struct xfs_btree_block *new_broot; int new_max; size_t new_size; char *np; @@ -2335,8 +2235,7 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); - ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, - KM_SLEEP); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP); ifp->if_broot_bytes = (int)new_size; return; } @@ -2347,18 +2246,16 @@ xfs_iroot_realloc( * location. The records don't change location because * they are kept butted up against the btree block header. */ - cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); - ifp->if_broot = (xfs_bmbt_block_t *) - kmem_realloc(ifp->if_broot, - new_size, + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ KM_SLEEP); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, - (int)new_size); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(ifp->if_broot_bytes <= XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); @@ -2372,7 +2269,7 @@ xfs_iroot_realloc( * records, just get rid of the root and clear the status bit. */ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); new_max = cur_max + rec_diff; ASSERT(new_max >= 0); if (new_max > 0) @@ -2380,11 +2277,11 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); + new_broot = kmem_alloc(new_size, KM_SLEEP); /* * First copy over the btree block header. */ - memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); + memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); } else { new_broot = NULL; ifp->if_flags &= ~XFS_IFBROOT; @@ -2397,18 +2294,16 @@ xfs_iroot_realloc( /* * First copy the records. */ - op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, - (int)new_size); + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); /* * Then copy the pointers. */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); } @@ -2511,64 +2406,6 @@ xfs_idata_realloc( ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); } - - - -/* - * Map inode to disk block and offset. - * - * mp -- the mount point structure for the current file system - * tp -- the current transaction - * ino -- the inode number of the inode to be located - * imap -- this structure is filled in with the information necessary - * to retrieve the given inode from disk - * flags -- flags to pass to xfs_dilocate indicating whether or not - * lookups in the inode btree were OK or not - */ -int -xfs_imap( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_imap_t *imap, - uint flags) -{ - xfs_fsblock_t fsbno; - int len; - int off; - int error; - - fsbno = imap->im_blkno ? - XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; - error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); - if (error) - return error; - - imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); - imap->im_len = XFS_FSB_TO_BB(mp, len); - imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); - imap->im_ioffset = (ushort)off; - imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); - - /* - * If the inode number maps to a block outside the bounds - * of the file system then return NULL rather than calling - * read_buf and panicing when we get an error from the - * driver. - */ - if ((imap->im_blkno + imap->im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " - " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", - (unsigned long long) imap->im_blkno, - (unsigned long long) imap->im_len, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - return EINVAL; - } - return 0; -} - void xfs_idestroy_fork( xfs_inode_t *ip, @@ -2613,70 +2450,6 @@ xfs_idestroy_fork( } /* - * This is called free all the memory associated with an inode. - * It must free the inode itself and any buffers allocated for - * if_extents/if_data and if_broot. It must also free the lock - * associated with the inode. - */ -void -xfs_idestroy( - xfs_inode_t *ip) -{ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - mrfree(&ip->i_lock); - mrfree(&ip->i_iolock); - -#ifdef XFS_INODE_TRACE - ktrace_free(ip->i_trace); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BMBT_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_mount_t *mp = ip->i_mount; - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&mp->m_ail_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_delete_ail(mp, lip); - else - spin_unlock(&mp->m_ail_lock); - } - xfs_inode_item_destroy(ip); - } - kmem_zone_free(xfs_inode_zone, ip); -} - - -/* * Increment the pin count of the given buffer. * This value is protected by ipinlock spinlock in the mount structure. */ @@ -2880,7 +2653,7 @@ xfs_iflush_fork( ASSERT(ifp->if_broot_bytes <= (XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ)); - xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); } @@ -2889,15 +2662,16 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); } break; case XFS_DINODE_FMT_UUID: if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { ASSERT(whichfork == XFS_DATA_FORK); - memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); } break; @@ -3030,7 +2804,6 @@ cluster_corrupt_out: XFS_BUF_CLR_BDSTRAT_FUNC(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_SHUT(bp); XFS_BUF_ERROR(bp,EIO); xfs_biodone(bp); } else { @@ -3172,7 +2945,7 @@ xfs_iflush( /* * Get the buffer containing the on-disk inode. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, + error = xfs_itobp(mp, NULL, ip, &dip, &bp, noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); if (error || !bp) { xfs_ifunlock(ip); @@ -3253,7 +3026,7 @@ xfs_iflush_int( } /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); /* * Clear i_update_core before copying out the data. @@ -3275,11 +3048,11 @@ xfs_iflush_int( */ xfs_synchronize_atime(ip); - if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, + if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", - ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); + ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, @@ -3342,7 +3115,7 @@ xfs_iflush_int( * because if the inode is dirty at all the core must * be. */ - xfs_dinode_to_disk(&dip->di_core, &ip->i_d); + xfs_dinode_to_disk(dip, &ip->i_d); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) @@ -3354,28 +3127,27 @@ xfs_iflush_int( * convert back to the old inode format. If the superblock version * has been updated, then make the conversion permanent. */ - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { if (!xfs_sb_version_hasnlink(&mp->m_sb)) { /* * Convert it back. */ ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); + dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); } else { /* * The superblock version has already been bumped, * so just make the conversion to the new inode * format permanent. */ - ip->i_d.di_version = XFS_DINODE_VERSION_2; - dip->di_core.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_version = 2; + dip->di_version = 2; ip->i_d.di_onlink = 0; - dip->di_core.di_onlink = 0; + dip->di_onlink = 0; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_core.di_pad[0]), 0, - sizeof(dip->di_core.di_pad)); + memset(&(dip->di_pad[0]), 0, + sizeof(dip->di_pad)); ASSERT(ip->i_d.di_projid == 0); } } @@ -3418,10 +3190,8 @@ xfs_iflush_int( iip->ili_format.ilf_fields = 0; iip->ili_logged = 1; - ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ - spin_lock(&mp->m_ail_lock); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); /* * Attach the function xfs_iflush_done to the inode's @@ -3459,45 +3229,8 @@ corrupt_out: } -/* - * Flush all inactive inodes in mp. - */ -void -xfs_iflush_all( - xfs_mount_t *mp) -{ - xfs_inode_t *ip; - - again: - XFS_MOUNT_ILOCK(mp); - ip = mp->m_inodes; - if (ip == NULL) - goto out; - - do { - /* Make sure we skip markers inserted by sync */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - - if (!VFS_I(ip)) { - XFS_MOUNT_IUNLOCK(mp); - xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); - goto again; - } - - ASSERT(vn_count(VFS_I(ip)) == 0); - - ip = ip->i_mnext; - } while (ip != mp->m_inodes); - out: - XFS_MOUNT_IUNLOCK(mp); -} #ifdef XFS_ILOCK_TRACE -ktrace_t *xfs_ilock_trace_buf; - void xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 6be310d41daf..1f175fa34b22 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -19,8 +19,7 @@ #define __XFS_INODE_H__ struct xfs_dinode; -struct xfs_dinode_core; - +struct xfs_inode; /* * Fork identifiers. @@ -63,7 +62,7 @@ typedef struct xfs_ext_irec { typedef struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ int if_real_bytes; /* bytes allocated in if_u1 */ - xfs_bmbt_block_t *if_broot; /* file's incore btree root */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ unsigned char if_ext_max; /* max # of extent records */ @@ -84,52 +83,14 @@ typedef struct xfs_ifork { } xfs_ifork_t; /* - * Flags for xfs_ichgtime(). + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ - -/* - * Per-fork incore inode flags. - */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ - -/* - * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate(). - */ -#define XFS_IMAP_LOOKUP 0x1 -#define XFS_IMAP_BULKSTAT 0x2 - -#ifdef __KERNEL__ -struct bhv_desc; -struct cred; -struct ktrace; -struct xfs_buf; -struct xfs_bmap_free; -struct xfs_bmbt_irec; -struct xfs_bmbt_block; -struct xfs_inode; -struct xfs_inode_log_item; -struct xfs_mount; -struct xfs_trans; -struct xfs_dquot; - -#if defined(XFS_ILOCK_TRACE) -#define XFS_ILOCK_KTRACE_SIZE 32 -extern ktrace_t *xfs_ilock_trace_buf; -extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *); -#else -#define xfs_ilock_trace(i,n,f,ra) -#endif - -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; /* * This is the xfs in-core inode structure. @@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp { } xfs_ictimestamp_t; /* - * NOTE: This structure must be kept identical to struct xfs_dinode_core + * NOTE: This structure must be kept identical to struct xfs_dinode * in xfs_dinode.h except for the endianess annotations. */ typedef struct xfs_icdinode { @@ -191,27 +152,97 @@ typedef struct xfs_icdinode { __uint32_t di_gen; /* generation number */ } xfs_icdinode_t; -typedef struct { - struct xfs_inode *ip_mnext; /* next inode in mount list */ - struct xfs_inode *ip_mprev; /* ptr to prev inode */ - struct xfs_mount *ip_mount; /* fs mount struct ptr */ -} xfs_iptr_t; +/* + * Flags for xfs_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) + + + +#ifdef __KERNEL__ + +struct bhv_desc; +struct cred; +struct ktrace; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +#if defined(XFS_ILOCK_TRACE) +#define XFS_ILOCK_KTRACE_SIZE 32 +extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *); +#else +#define xfs_ilock_trace(i,n,f,ra) +#endif + +typedef struct dm_attrs_s { + __uint32_t da_dmevmask; /* DMIG event mask */ + __uint16_t da_dmstate; /* DMIG state info */ + __uint16_t da_pad; /* DMIG extra padding */ +} dm_attrs_t; typedef struct xfs_inode { /* Inode linking and identification information. */ - struct xfs_inode *i_mnext; /* next inode in mount list */ - struct xfs_inode *i_mprev; /* ptr to prev inode */ struct xfs_mount *i_mount; /* fs mount struct ptr */ - struct list_head i_reclaim; /* reclaim list */ - struct inode *i_vnode; /* vnode backpointer */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ - xfs_daddr_t i_blkno; /* blkno of inode buffer */ - ushort i_len; /* len of inode buffer */ - ushort i_boffset; /* off of inode in buffer */ + struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ xfs_ifork_t *i_afp; /* attribute fork pointer */ @@ -230,7 +261,6 @@ typedef struct xfs_inode { unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ unsigned char i_update_size; /* di_size field is dirty */ - unsigned int i_gen; /* generation count */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ @@ -238,6 +268,10 @@ typedef struct xfs_inode { xfs_fsize_t i_size; /* in-memory size */ xfs_fsize_t i_new_size; /* size when write completes */ atomic_t i_iocount; /* outstanding I/O count */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ + /* Trace buffers per inode. */ #ifdef XFS_INODE_TRACE struct ktrace *i_trace; /* general inode trace */ @@ -245,7 +279,7 @@ typedef struct xfs_inode { #ifdef XFS_BMAP_TRACE struct ktrace *i_xtrace; /* inode extent list trace */ #endif -#ifdef XFS_BMBT_TRACE +#ifdef XFS_BTREE_TRACE struct ktrace *i_btrace; /* inode bmap btree trace */ #endif #ifdef XFS_RW_TRACE @@ -265,13 +299,30 @@ typedef struct xfs_inode { /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { - return (struct xfs_inode *)inode->i_private; + return container_of(inode, struct xfs_inode, i_vnode); } /* convert from xfs inode to vfs inode */ static inline struct inode *VFS_I(struct xfs_inode *ip) { - return (struct inode *)ip->i_vnode; + return &ip->i_vnode; +} + +/* + * Get rid of a partially initialized inode. + * + * We have to go through destroy_inode to make sure allocations + * from init_inode_always like the security data are undone. + * + * We mark the inode bad so that it takes the short cut in + * the reclaim path instead of going through the flush path + * which doesn't make sense for an inode that has never seen the + * light of day. + */ +static inline void xfs_destroy_inode(struct xfs_inode *ip) +{ + make_bad_inode(VFS_I(ip)); + return destroy_inode(VFS_I(ip)); } /* @@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) spin_unlock(&ip->i_flags_lock); return ret; } -#endif /* __KERNEL__ */ - /* - * Fork handling. + * Manage the i_flush queue embedded in the inode. This completion + * queue synchronizes processes attempting to flush the in-core + * inode back to disk. */ +static inline void xfs_iflock(xfs_inode_t *ip) +{ + wait_for_completion(&ip->i_flush); +} -#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) - -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - (ip)->i_afp) -#define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount)) -#define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \ - 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) +static inline int xfs_iflock_nowait(xfs_inode_t *ip) +{ + return try_wait_for_completion(&ip->i_flush); +} -#ifdef __KERNEL__ +static inline void xfs_ifunlock(xfs_inode_t *ip) +{ + complete(&ip->i_flush); +} /* * In-core inode flags. */ -#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ -#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ -#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ -#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ -#define XFS_ISTALE 0x0010 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ -#define XFS_INEW 0x0040 -#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */ -#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */ - /* to the Linux inode state. */ -#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */ +#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ +#define XFS_ISTALE 0x0002 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ +#define XFS_INEW 0x0008 /* inode has just been allocated */ +#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ +#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ /* * Flags for inode locking. @@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) ((pip)->i_d.di_mode & S_ISGID)) /* - * Flags for xfs_iget() - */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_BULKSTAT 0x2 - -/* * xfs_iget.c prototypes. */ -void xfs_ihash_init(struct xfs_mount *); -void xfs_ihash_free(struct xfs_mount *); xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, struct xfs_trans *); int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, @@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_map_shared(xfs_inode_t *); void xfs_iunlock_map_shared(xfs_inode_t *, uint); void xfs_ireclaim(xfs_inode_t *); -int xfs_finish_reclaim(xfs_inode_t *, int, int); -int xfs_finish_reclaim_all(struct xfs_mount *, int); /* * xfs_inode.c prototypes. */ -int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, - xfs_daddr_t, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_inode_t **, xfs_daddr_t, uint); -int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, int, struct xfs_buf **, boolean_t *, xfs_inode_t **); -void xfs_dinode_from_disk(struct xfs_icdinode *, - struct xfs_dinode_core *); -void xfs_dinode_to_disk(struct xfs_dinode_core *, - struct xfs_icdinode *); uint xfs_ip2xflags(struct xfs_inode *); uint xfs_dic2xflags(struct xfs_dinode *); @@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, xfs_fsize_t, int, int); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); -void xfs_idestroy_fork(xfs_inode_t *, int); -void xfs_idestroy(xfs_inode_t *); -void xfs_idata_realloc(xfs_inode_t *, int, int); -void xfs_iextract(xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); -void xfs_iroot_realloc(xfs_inode_t *, int, int); void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); -int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int); int xfs_iflush(xfs_inode_t *, uint); -void xfs_iflush_all(struct xfs_mount *); void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, uint); @@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_atime(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); +#if defined(XFS_INODE_TRACE) + +#define INODE_TRACE_SIZE 16 /* number of trace entries */ +#define INODE_KTRACE_ENTRY 1 +#define INODE_KTRACE_EXIT 2 +#define INODE_KTRACE_HOLD 3 +#define INODE_KTRACE_REF 4 +#define INODE_KTRACE_RELE 5 + +extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *); +extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *); +extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *); +extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); +extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); +#define xfs_itrace_entry(ip) \ + _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address) +#define xfs_itrace_exit(ip) \ + _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address) +#define xfs_itrace_exit_tag(ip, tag) \ + _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) +#define xfs_itrace_ref(ip) \ + _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address) + +#else +#define xfs_itrace_entry(a) +#define xfs_itrace_exit(a) +#define xfs_itrace_exit_tag(a, b) +#define xfs_itrace_hold(a, b, c, d) +#define xfs_itrace_ref(a) +#define xfs_itrace_rele(a, b, c, d) +#endif + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + atomic_inc(&(VFS_I(ip)->i_count)); \ + xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ +} while (0) + +#define IRELE(ip) \ +do { \ + xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ + iput(VFS_I(ip)); \ +} while (0) + +#endif /* __KERNEL__ */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_BULKSTAT 0x2 + +int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, + xfs_ino_t, struct xfs_dinode **, + struct xfs_buf **, int *, uint); +int xfs_itobp(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, struct xfs_dinode **, + struct xfs_buf **, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, xfs_daddr_t, uint); +void xfs_dinode_from_disk(struct xfs_icdinode *, + struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *, + struct xfs_icdinode *); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int); + xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, xfs_bmbt_irec_t *); @@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) #ifdef DEBUG -void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); +void xfs_isize_check(struct xfs_mount *, struct xfs_inode *, + xfs_fsize_t); #else /* DEBUG */ #define xfs_isize_check(mp, ip, isize) #endif /* DEBUG */ @@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone; extern struct kmem_zone *xfs_inode_zone; extern struct kmem_zone *xfs_ili_zone; -/* - * Manage the i_flush queue embedded in the inode. This completion - * queue synchronizes processes attempting to flush the in-core - * inode back to disk. - */ -static inline void xfs_iflock(xfs_inode_t *ip) -{ - wait_for_completion(&ip->i_flush); -} - -static inline int xfs_iflock_nowait(xfs_inode_t *ip) -{ - return try_wait_for_completion(&ip->i_flush); -} - -static inline void xfs_ifunlock(xfs_inode_t *ip) -{ - complete(&ip->i_flush); -} - -#endif /* __KERNEL__ */ - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 97c7452e2620..977c4aec587e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -281,7 +281,7 @@ xfs_inode_item_format( xfs_mark_inode_dirty_sync(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; - vecp->i_len = sizeof(xfs_dinode_core_t); + vecp->i_len = sizeof(struct xfs_icdinode); XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); vecp++; nvecs++; @@ -296,9 +296,8 @@ xfs_inode_item_format( * has a new version number, then we don't bother converting back. */ mp = ip->i_mount; - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { + ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); + if (ip->i_d.di_version == 1) { if (!xfs_sb_version_hasnlink(&mp->m_sb)) { /* * Convert it back. @@ -311,7 +310,7 @@ xfs_inode_item_format( * so just make the conversion to the new inode * format permanent. */ - ip->i_d.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_version = 2; ip->i_d.di_onlink = 0; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); } @@ -932,6 +931,7 @@ xfs_inode_item_init( iip->ili_item.li_type = XFS_LI_INODE; iip->ili_item.li_ops = &xfs_inode_item_ops; iip->ili_item.li_mountp = mp; + iip->ili_item.li_ailp = mp->m_ail; iip->ili_inode = ip; /* @@ -942,9 +942,9 @@ xfs_inode_item_init( iip->ili_format.ilf_type = XFS_LI_INODE; iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_blkno; - iip->ili_format.ilf_len = ip->i_len; - iip->ili_format.ilf_boffset = ip->i_boffset; + iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; + iip->ili_format.ilf_len = ip->i_imap.im_len; + iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* @@ -976,9 +976,8 @@ xfs_iflush_done( xfs_buf_t *bp, xfs_inode_log_item_t *iip) { - xfs_inode_t *ip; - - ip = iip->ili_inode; + xfs_inode_t *ip = iip->ili_inode; + struct xfs_ail *ailp = iip->ili_item.li_ailp; /* * We only want to pull the item from the AIL if it is @@ -991,15 +990,12 @@ xfs_iflush_done( */ if (iip->ili_logged && (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { - spin_lock(&ip->i_mount->m_ail_lock); + spin_lock(&ailp->xa_lock); if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(ip->i_mount, - (xfs_log_item_t*)iip); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip); } else { - spin_unlock(&ip->i_mount->m_ail_lock); + spin_unlock(&ailp->xa_lock); } } @@ -1031,21 +1027,20 @@ void xfs_iflush_abort( xfs_inode_t *ip) { - xfs_inode_log_item_t *iip; + xfs_inode_log_item_t *iip = ip->i_itemp; xfs_mount_t *mp; iip = ip->i_itemp; mp = ip->i_mount; if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - spin_lock(&mp->m_ail_lock); + spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); } else - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 40513077ab36..1ff04cc323ad 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) +#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + #ifdef __KERNEL__ struct xfs_buf; @@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item { } xfs_inode_log_item_t; -#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) -static inline int xfs_ilog_fdata(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); -} - -#endif /* __KERNEL__ */ - -#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) -static inline int xfs_ilog_fbroot(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); -} - -#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) -static inline int xfs_ilog_fext(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); -} - static inline int xfs_inode_clean(xfs_inode_t *ip) { return (!ip->i_itemp || @@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip) !ip->i_update_core; } - -#ifdef __KERNEL__ - extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 67f22b2b44b3..911062cf73a6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -290,7 +290,6 @@ STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, xfs_inode_t *ip, - xfs_fsize_t isize, xfs_extlen_t extsize, xfs_fileoff_t *last_fsb) { @@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb( * stripe width and we are allocating past the allocation eof. */ else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) + (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); /* * Roundup the allocation request to a stripe unit (m_dalign) boundary * if the file size is >= stripe unit size, and we are allocating past * the allocation eof. */ - else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) + else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); /* @@ -403,7 +402,6 @@ xfs_iomap_write_direct( xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; xfs_extlen_t extsz, temp; - xfs_fsize_t isize; int nimaps; int bmapi_flag; int quota_flag; @@ -426,15 +424,10 @@ xfs_iomap_write_direct( rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); - isize = ip->i_size; - if (ip->i_new_size > isize) - isize = ip->i_new_size; - offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > isize) { - error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, - &last_fsb); + if ((offset + count) > ip->i_size) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) goto error_out; } else { @@ -559,7 +552,6 @@ STATIC int xfs_iomap_eof_want_preallocate( xfs_mount_t *mp, xfs_inode_t *ip, - xfs_fsize_t isize, xfs_off_t offset, size_t count, int ioflag, @@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate( int n, error, imaps; *prealloc = 0; - if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) + if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size) return 0; /* @@ -617,7 +609,6 @@ xfs_iomap_write_delay( xfs_fileoff_t ioalign; xfs_fsblock_t firstblock; xfs_extlen_t extsz; - xfs_fsize_t isize; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; int prealloc, fsynced = 0; @@ -637,11 +628,7 @@ xfs_iomap_write_delay( offset_fsb = XFS_B_TO_FSBT(mp, offset); retry: - isize = ip->i_size; - if (ip->i_new_size > isize) - isize = ip->i_new_size; - - error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count, + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; @@ -655,8 +642,7 @@ retry: } if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, - &last_fsb); + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) return error; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cf6754a3c5b3..e19d0a8d5618 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -69,7 +69,7 @@ xfs_bulkstat_one_iget( } ASSERT(ip != NULL); - ASSERT(ip->i_blkno != (xfs_daddr_t)0); + ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; @@ -125,13 +125,9 @@ STATIC void xfs_bulkstat_one_dinode( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ - xfs_dinode_t *dip, /* dinode inode pointer */ + xfs_dinode_t *dic, /* dinode inode pointer */ xfs_bstat_t *buf) /* return buffer */ { - xfs_dinode_core_t *dic; /* dinode core info pointer */ - - dic = &dip->di_core; - /* * The inode format changed when we moved the link count and * made it 32 bits long. If this is an old format inode, @@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode( * the new format. We don't change the version number so that we * can distinguish this from a real new format inode. */ - if (dic->di_version == XFS_DINODE_VERSION_1) { + if (dic->di_version == 1) { buf->bs_nlink = be16_to_cpu(dic->di_onlink); buf->bs_projid = 0; } else { @@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode( buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); - buf->bs_xflags = xfs_dic2xflags(dip); + buf->bs_xflags = xfs_dic2xflags(dic); buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; buf->bs_extents = be32_to_cpu(dic->di_nextents); buf->bs_gen = be32_to_cpu(dic->di_gen); @@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode( switch (dic->di_format) { case XFS_DINODE_FMT_DEV: - buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); + buf->bs_rdev = xfs_dinode_get_rdev(dic); buf->bs_blksize = BLKDEV_IOSIZE; buf->bs_blocks = 0; break; @@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode( } } +/* Return 0 on success or positive error */ STATIC int xfs_bulkstat_one_fmt( void __user *ubuffer, + int ubsize, + int *ubused, const xfs_bstat_t *buffer) { + if (ubsize < sizeof(*buffer)) + return XFS_ERROR(ENOMEM); if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) - return -EFAULT; - return sizeof(*buffer); + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*buffer); + return 0; } /* * Return stat information for one inode. * Return 0 if ok, else errno. */ -int /* error status */ -xfs_bulkstat_one( +int /* error status */ +xfs_bulkstat_one_int( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ void *dibuff, /* on-disk inode buffer */ @@ -221,15 +224,12 @@ xfs_bulkstat_one( xfs_bstat_t *buf; /* return buffer */ int error = 0; /* error value */ xfs_dinode_t *dip; /* dinode inode pointer */ - bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt; dip = (xfs_dinode_t *)dibuff; *stat = BULKSTAT_RV_NOTHING; if (!buffer || xfs_internal_inum(mp, ino)) return XFS_ERROR(EINVAL); - if (ubsize < sizeof(*buf)) - return XFS_ERROR(ENOMEM); buf = kmem_alloc(sizeof(*buf), KM_SLEEP); @@ -244,21 +244,34 @@ xfs_bulkstat_one( xfs_bulkstat_one_dinode(mp, ino, dip, buf); } - error = formatter(buffer, buf); - if (error < 0) { - error = EFAULT; + error = formatter(buffer, ubsize, ubused, buf); + if (error) goto out_free; - } *stat = BULKSTAT_RV_DIDONE; - if (ubused) - *ubused = error; out_free: kmem_free(buf); return error; } +int +xfs_bulkstat_one( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + void *private_data, /* my private data */ + xfs_daddr_t bno, /* starting bno of inode cluster */ + int *ubused, /* bytes used by me */ + void *dibuff, /* on-disk inode buffer */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, bno, + ubused, dibuff, stat); +} + /* * Test to see whether we can use the ondisk inode directly, based * on the given bulkstat flags, filling in dipp accordingly. @@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode( * to disk yet. This is a temporary hack that would require a proper * fix in the future. */ - if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || - !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || - !dip->di_core.di_mode) + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || + !XFS_DINODE_GOOD_VERSION(dip->di_version) || + !dip->di_mode) return 0; if (flags & BULKSTAT_FG_QUICK) { *dipp = dip; return 1; } /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ - aformat = dip->di_core.di_aformat; + aformat = dip->di_aformat; if ((XFS_DFORK_Q(dip) == 0) || (aformat == XFS_DINODE_FMT_LOCAL) || - (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { + (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) { *dipp = dip; return 1; } @@ -359,7 +372,6 @@ xfs_bulkstat( int ubused; /* bytes used by formatter */ xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ xfs_dinode_t *dip; /* ptr into bp for specific inode */ - xfs_inode_t *ip; /* ptr to in-core inode struct */ /* * Get the last inode value, see if there's nothing to do. @@ -416,8 +428,7 @@ xfs_bulkstat( /* * Allocate and initialize a btree cursor for ialloc btree. */ - cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -472,7 +483,7 @@ xfs_bulkstat( * In any case, increment to the next record. */ if (!error) - error = xfs_inobt_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &tmp); } else { /* * Start of ag. Lookup the first inode chunk. @@ -539,7 +550,7 @@ xfs_bulkstat( * Set agino to after this chunk and bump the cursor. */ agino = gino + XFS_INODES_PER_CHUNK; - error = xfs_inobt_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &tmp); cond_resched(); } /* @@ -586,6 +597,8 @@ xfs_bulkstat( if (flags & (BULKSTAT_FG_QUICK | BULKSTAT_FG_INLINE)) { + int offset; + ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, @@ -594,21 +607,15 @@ xfs_bulkstat( /* * Get the inode cluster buffer */ - ASSERT(xfs_inode_zone != NULL); - ip = kmem_zone_zalloc(xfs_inode_zone, - KM_SLEEP); - ip->i_ino = ino; - ip->i_mount = mp; - spin_lock_init(&ip->i_flags_lock); if (bp) xfs_buf_relse(bp); - error = xfs_itobp(mp, NULL, ip, - &dip, &bp, bno, - XFS_IMAP_BULKSTAT, - XFS_BUF_LOCK); + + error = xfs_inotobp(mp, NULL, ino, &dip, + &bp, &offset, + XFS_IGET_BULKSTAT); + if (!error) - clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; - kmem_zone_free(xfs_inode_zone, ip); + clustidx = offset / mp->m_sb.sb_inodesize; if (XFS_TEST_ERROR(error != 0, mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, XFS_RANDOM_BULKSTAT_READ_CHUNK)) { @@ -842,8 +849,7 @@ xfs_inumbers( agino = 0; continue; } - cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -887,7 +893,7 @@ xfs_inumbers( bufidx = 0; } if (left) { - error = xfs_inobt_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); cur = NULL; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index a1f18fce9b70..1fb04e7deb61 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -71,9 +71,23 @@ xfs_bulkstat_single( typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ const xfs_bstat_t *buffer); /* buffer to read from */ int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + xfs_daddr_t bno, + int *ubused, + void *dibuff, + int *stat); + +int xfs_bulkstat_one( xfs_mount_t *mp, xfs_ino_t ino, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3608a0f0a5f6..f4726f702a9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, /* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, +STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, int unit_bytes, int count, char clientid, uint flags); -STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket); #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); @@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp, */ xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); xlog_ungrant_log_space(log, ticket); - xlog_ticket_put(log, ticket); + xfs_log_ticket_put(ticket); } else { xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); xlog_regrant_reserve_log_space(log, ticket); @@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp, retval = xlog_regrant_write_log_space(log, internal_ticket); } else { /* may sleep if need to allocate more tickets */ - internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, + internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, client, flags); if (!internal_ticket) return XFS_ERROR(ENOMEM); @@ -572,12 +571,12 @@ xfs_log_mount( /* * Initialize the AIL now we have a log. */ - spin_lock_init(&mp->m_ail_lock); error = xfs_trans_ail_init(mp); if (error) { cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); goto error; } + mp->m_log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all @@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); @@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (tic) { xlog_trace_loggrant(log, tic, "unmount rec"); xlog_ungrant_log_space(log, tic); - xlog_ticket_put(log, tic); + xfs_log_ticket_put(tic); } } else { /* @@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp) spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - spin_unlock(&log->l_icloglock); xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); error = xlog_state_release_iclog(log, iclog); spin_lock(&log->l_icloglock); @@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp, int xfs_log_need_covered(xfs_mount_t *mp) { - int needed = 0, gen; + int needed = 0; xlog_t *log = mp->m_log; if (!xfs_fs_writable(mp)) @@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp) spin_lock(&log->l_icloglock); if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || (log->l_covered_state == XLOG_STATE_COVER_NEED2)) - && !xfs_trans_first_ail(mp, &gen) + && !xfs_trans_ail_tail(log->l_ailp) && xlog_iclogs_empty(log)) { if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; @@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) xfs_lsn_t tail_lsn; xlog_t *log = mp->m_log; - tail_lsn = xfs_trans_tail_ail(mp); + tail_lsn = xfs_trans_ail_tail(mp->m_ail); spin_lock(&log->l_grant_lock); if (tail_lsn != 0) { log->l_tail_lsn = tail_lsn; @@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp) ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); aborted = 0; - - /* - * Some versions of cpp barf on the recursive definition of - * ic_log -> hic_fields.ic_log and expand ic_log twice when - * it is passed through two macros. Workaround broken cpp. - */ l = iclog->ic_log; /* @@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp, XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); iclog->ic_bp = bp; - iclog->hic_data = bp->b_addr; + iclog->ic_data = bp->b_addr; #ifdef DEBUG log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); #endif @@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp, atomic_set(&iclog->ic_refcnt, 0); spin_lock_init(&iclog->ic_callback_lock); iclog->ic_callback_tail = &(iclog->ic_callback); - iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); @@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp, */ if (threshold_lsn && !XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_push_ail(mp, threshold_lsn); + xfs_trans_ail_push(log->l_ailp, threshold_lsn); } /* xlog_grant_push_ail */ @@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp, if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); record_cnt = data_cnt = 0; + spin_lock(&log->l_icloglock); xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; @@ -3200,7 +3195,7 @@ try_again: STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - spin_lock(&log->l_icloglock); + ASSERT(spin_is_locked(&log->l_icloglock)); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); @@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) ASSERT(iclog->ic_state & (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); } - - spin_unlock(&log->l_icloglock); -} /* xlog_state_want_sync */ - +} /***************************************************************************** @@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) */ /* - * Free a used ticket. + * Free a used ticket when it's refcount falls to zero. */ -STATIC void -xlog_ticket_put(xlog_t *log, - xlog_ticket_t *ticket) +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) { - sv_destroy(&ticket->t_wait); - kmem_zone_free(xfs_log_ticket_zone, ticket); -} /* xlog_ticket_put */ + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) { + sv_destroy(&ticket->t_wait); + kmem_zone_free(xfs_log_ticket_zone, ticket); + } +} +xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} /* * Allocate and initialise a new log ticket. */ STATIC xlog_ticket_t * -xlog_ticket_get(xlog_t *log, +xlog_ticket_alloc(xlog_t *log, int unit_bytes, int cnt, char client, @@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log, unit_bytes += 2*BBSIZE; } + atomic_set(&tic->t_ref, 1); tic->t_unit_res = unit_bytes; tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; @@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log, xlog_tic_reset_res(tic); return tic; -} /* xlog_ticket_get */ +} /****************************************************************************** @@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log, ptr = iclog->ic_datap; base_ptr = ptr; ophead = (xlog_op_header_t *)ptr; - xhdr = (xlog_in_core_2_t *)&iclog->ic_header; + xhdr = iclog->ic_data; for (i = 0; i < len; i++) { ophead = (xlog_op_header_t *)ptr; @@ -3558,7 +3562,8 @@ xfs_log_force_umount( if (!log || log->l_flags & XLOG_ACTIVE_RECOVERY) { mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - XFS_BUF_DONE(mp->m_sb_bp); + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); return 0; } @@ -3579,7 +3584,9 @@ xfs_log_force_umount( spin_lock(&log->l_icloglock); spin_lock(&log->l_grant_lock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - XFS_BUF_DONE(mp->m_sb_bp); + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + /* * This flag is sort of redundant because of the mount flag, but * it's good to maintain the separation between the log and the rest diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d47b91f10822..8a3e84e900a3 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -134,6 +134,7 @@ typedef struct xfs_log_callback { #ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; +struct xlog_ticket; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, xfs_log_ticket_t ticket, void **iclog, @@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); +struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +void xfs_log_ticket_put(struct xlog_ticket *ticket); + #endif diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index e7d8f84443fa..654167be0efb 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -245,6 +245,7 @@ typedef struct xlog_ticket { struct xlog_ticket *t_next; /* :4|8 */ struct xlog_ticket *t_prev; /* :4|8 */ xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ @@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header { } xlog_rec_ext_header_t; #ifdef __KERNEL__ + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. @@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header { * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ -typedef struct xlog_iclog_fields { +typedef struct xlog_in_core { sv_t ic_force_wait; sv_t ic_write_wait; struct xlog_in_core *ic_next; @@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields { /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; -} xlog_iclog_fields_t; - -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; - -typedef struct xlog_in_core { - xlog_iclog_fields_t hic_fields; - xlog_in_core_2_t *hic_data; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header } xlog_in_core_t; /* - * Defines to save our code from this glop. - */ -#define ic_force_wait hic_fields.ic_force_wait -#define ic_write_wait hic_fields.ic_write_wait -#define ic_next hic_fields.ic_next -#define ic_prev hic_fields.ic_prev -#define ic_bp hic_fields.ic_bp -#define ic_log hic_fields.ic_log -#define ic_callback hic_fields.ic_callback -#define ic_callback_lock hic_fields.ic_callback_lock -#define ic_callback_tail hic_fields.ic_callback_tail -#define ic_trace hic_fields.ic_trace -#define ic_size hic_fields.ic_size -#define ic_offset hic_fields.ic_offset -#define ic_refcnt hic_fields.ic_refcnt -#define ic_bwritecnt hic_fields.ic_bwritecnt -#define ic_state hic_fields.ic_state -#define ic_datap hic_fields.ic_datap -#define ic_header hic_data->hic_header - -/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -404,6 +385,7 @@ typedef struct xlog_in_core { typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 70e3ba32e6be..35cca98bd94c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -36,7 +36,6 @@ #include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_inode_item.h" -#include "xfs_imap.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_log_priv.h" @@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, xlog_recover_item_t *item); #if defined(DEBUG) STATIC void xlog_recover_check_summary(xlog_t *); -STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int); #else #define xlog_recover_check_summary(log) -#define xlog_recover_check_ail(mp, lip, gen) #endif @@ -270,21 +267,16 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - xfs_mount_t *mp; - - ASSERT(XFS_BUF_FSPRIVATE(bp, void *)); - if (XFS_BUF_GETERROR(bp)) { /* * We're not going to bother about retrying * this during recovery. One strike! */ - mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *); xfs_ioerror_alert("xlog_recover_iodone", - mp, bp, XFS_BUF_ADDR(bp)); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + bp->b_mount, bp, XFS_BUF_ADDR(bp)); + xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); } - XFS_BUF_SET_FSPRIVATE(bp, NULL); + bp->b_mount = NULL; XFS_BUF_CLR_IODONE_FUNC(bp); xfs_biodone(bp); } @@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans( XFS_BUF_STALE(bp); error = xfs_bwrite(mp, bp); } else { - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); + ASSERT(bp->b_mount == NULL || bp->b_mount == mp); + bp->b_mount = mp; XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); } @@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans( xfs_inode_log_format_t *in_f; xfs_mount_t *mp; xfs_buf_t *bp; - xfs_imap_t imap; xfs_dinode_t *dip; xfs_ino_t ino; int len; @@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans( } ino = in_f->ilf_ino; mp = log->l_mp; - if (ITEM_TYPE(item) == XFS_LI_INODE) { - imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno; - imap.im_len = in_f->ilf_len; - imap.im_boffset = in_f->ilf_boffset; - } else { - /* - * It's an old inode format record. We don't know where - * its cluster is located on disk, and we can't allow - * xfs_imap() to figure it out because the inode btrees - * are not ready to be used. Therefore do not pass the - * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give - * us only the single block in which the inode lives - * rather than its cluster, so we must make sure to - * invalidate the buffer when we write it out below. - */ - imap.im_blkno = 0; - error = xfs_imap(log->l_mp, NULL, ino, &imap, 0); - if (error) - goto error; - } /* * Inode buffers can be freed, look out for it, * and do not replay the inode. */ - if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { error = 0; goto error; } - bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, - XFS_BUF_LOCK); + bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, + in_f->ilf_len, XFS_BUF_LOCK); if (XFS_BUF_ISERROR(bp)) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, imap.im_blkno); + bp, in_f->ilf_blkno); error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); goto error; } error = 0; ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { + if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { xfs_buf_relse(bp); xfs_fs_cmn_err(CE_ALERT, mp, "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", @@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans( } /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { + if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ - if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { @@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans( error = EFSCORRUPTED; goto error; } - if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { + if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", XFS_ERRLEVEL_LOW, mp, dicp); xfs_buf_relse(bp); @@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans( } /* The core is in in-core format */ - xfs_dinode_to_disk(&dip->di_core, - (xfs_icdinode_t *)item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { - memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), - item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), - item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); + if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { + memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), + item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); } fields = in_f->ilf_fields; switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { case XFS_ILOG_DEV: - dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); break; case XFS_ILOG_UUID: - dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); break; } @@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans( switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: case XFS_ILOG_DEXT: - memcpy(&dip->di_u, src, len); + memcpy(XFS_DFORK_DPTR(dip), src, len); break; case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, - &(dip->di_u.di_bmbt), + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), XFS_DFORK_DSIZE(dip, mp)); break; @@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans( case XFS_ILOG_ABROOT: dest = XFS_DFORK_APTR(dip); - xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, - (xfs_bmdr_block_t*)dest, + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, XFS_DFORK_ASIZE(dip, mp)); break; @@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans( write_inode_buffer: if (ITEM_TYPE(item) == XFS_LI_INODE) { - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); + ASSERT(bp->b_mount == NULL || bp->b_mount == mp); + bp->b_mount = mp; XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); } else { @@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans( memcpy(ddq, recddq, item->ri_buf[1].i_len); ASSERT(dq_f->qlf_size == 2); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); + ASSERT(bp->b_mount == NULL || bp->b_mount == mp); + bp->b_mount = mp; XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); xfs_bdwrite(mp, bp); @@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans( efip->efi_next_extent = efi_formatp->efi_nextents; efip->efi_flags |= XFS_EFI_COMMITTED; - spin_lock(&mp->m_ail_lock); + spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_update_ail() drops the AIL lock. + * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); return 0; } @@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans( xlog_recover_item_t *item, int pass) { - xfs_mount_t *mp; xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; - int gen; __uint64_t efi_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; if (pass == XLOG_RECOVER_PASS1) { return; @@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans( * Search for the efi with the id in the efd format structure * in the AIL. */ - mp = log->l_mp; - spin_lock(&mp->m_ail_lock); - lip = xfs_trans_first_ail(mp, &gen); + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_delete_ail() drops the + * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_delete_ail(mp, lip); + xfs_trans_ail_delete(ailp, lip); xfs_efi_item_free(efip); - return; + spin_lock(&ailp->xa_lock); + break; } } - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + lip = xfs_trans_ail_cursor_next(ailp, &cur); } - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); } /* @@ -3036,33 +3007,6 @@ abort_error: } /* - * Verify that once we've encountered something other than an EFI - * in the AIL that there are no more EFIs in the AIL. - */ -#if defined(DEBUG) -STATIC void -xlog_recover_check_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - int gen) -{ - int orig_gen = gen; - - do { - ASSERT(lip->li_type != XFS_LI_EFI); - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); - /* - * The check will be bogus if we restart from the - * beginning of the AIL, so ASSERT that we don't. - * We never should since we're holding the AIL lock - * the entire time. - */ - ASSERT(gen == orig_gen); - } while (lip != NULL); -} -#endif /* DEBUG */ - -/* * When this is called, all of the EFIs which did not have * corresponding EFDs should be in the AIL. What we do now * is free the extents associated with each one. @@ -3086,20 +3030,23 @@ xlog_recover_process_efis( { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; - int gen; - xfs_mount_t *mp; int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; - mp = log->l_mp; - spin_lock(&mp->m_ail_lock); - - lip = xfs_trans_first_ail(mp, &gen); + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. */ if (lip->li_type != XFS_LI_EFI) { - xlog_recover_check_ail(mp, lip, gen); +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif break; } @@ -3108,18 +3055,20 @@ xlog_recover_process_efis( */ efip = (xfs_efi_log_item_t *)lip; if (efip->efi_flags & XFS_EFI_RECOVERED) { - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } - spin_unlock(&mp->m_ail_lock); - error = xlog_recover_process_efi(mp, efip); + spin_unlock(&ailp->xa_lock); + error = xlog_recover_process_efi(log->l_mp, efip); + spin_lock(&ailp->xa_lock); if (error) - return error; - spin_lock(&mp->m_ail_lock); - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + goto out; + lip = xfs_trans_ail_cursor_next(ailp, &cur); } - spin_unlock(&mp->m_ail_lock); +out: + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); return error; } @@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); - if (!error) - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agibp); + error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), + 0, 0, 0); if (error) goto out_abort; - error = EINVAL; - agi = XFS_BUF_TO_AGI(agibp); - if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) goto out_abort; + agi = XFS_BUF_TO_AGI(agibp); agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); @@ -3172,6 +3118,62 @@ out_error: return; } +STATIC xfs_agino_t +xlog_recover_process_one_iunlink( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino, + int bucket) +{ + struct xfs_buf *ibp; + struct xfs_dinode *dip; + struct xfs_inode *ip; + xfs_ino_t ino; + int error; + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); + if (error) + goto fail; + + /* + * Get the on disk inode to find the next inode in the bucket. + */ + error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); + if (error) + goto fail_iput; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + /* setup for the next pass */ + agino = be32_to_cpu(dip->di_next_unlinked); + xfs_buf_relse(ibp); + + /* + * Prevent any DMAPI event from being sent when the reference on + * the inode is dropped. + */ + ip->i_d.di_dmevmask = 0; + + IRELE(ip); + return agino; + + fail_iput: + IRELE(ip); + fail: + /* + * We can't read in the inode this bucket points to, or this inode + * is messed up. Just ditch this bucket of inodes. We will lose + * some inodes and space, but at least we won't hang. + * + * Call xlog_recover_clear_agi_bucket() to perform a transaction to + * clear the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, bucket); + return NULLAGINO; +} + /* * xlog_iunlink_recover * @@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks( xfs_agnumber_t agno; xfs_agi_t *agi; xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_dinode_t *dip; - xfs_inode_t *ip; xfs_agino_t agino; - xfs_ino_t ino; int bucket; int error; uint mp_dmevmask; @@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks( /* * Find the agi for this ag. */ - agibp = xfs_buf_read(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", - log->l_mp, agibp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt + * after we've recovered all the ag's we can.... + */ + continue; } agi = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum)); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* * Release the agi buffer so that it can * be acquired in the normal course of the @@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks( */ xfs_buf_relse(agibp); - ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); - ASSERT(error || (ip != NULL)); - - if (!error) { - /* - * Get the on disk inode to find the - * next inode in the bucket. - */ - error = xfs_itobp(mp, NULL, ip, &dip, - &ibp, 0, 0, - XFS_BUF_LOCK); - ASSERT(error || (dip != NULL)); - } - - if (!error) { - ASSERT(ip->i_d.di_nlink == 0); - - /* setup for the next pass */ - agino = be32_to_cpu( - dip->di_next_unlinked); - xfs_buf_relse(ibp); - /* - * Prevent any DMAPI event from - * being sent when the - * reference on the inode is - * dropped. - */ - ip->i_d.di_dmevmask = 0; - - /* - * If this is a new inode, handle - * it specially. Otherwise, - * just drop our reference to the - * inode. If there are no - * other references, this will - * send the inode to - * xfs_inactive() which will - * truncate the file and free - * the inode. - */ - if (ip->i_d.di_mode == 0) - xfs_iput_new(ip, 0); - else - IRELE(ip); - } else { - /* - * We can't read in the inode - * this bucket points to, or - * this inode is messed up. Just - * ditch this bucket of inodes. We - * will lose some inodes and space, - * but at least we won't hang. Call - * xlog_recover_clear_agi_bucket() - * to perform a transaction to clear - * the inode pointer in the bucket. - */ - xlog_recover_clear_agi_bucket(mp, agno, - bucket); - - agino = NULLAGINO; - } + agino = xlog_recover_process_one_iunlink(mp, + agno, agino, bucket); /* * Reacquire the agibuffer and continue around - * the loop. + * the loop. This should never fail as we know + * the buffer was good earlier on. */ - agibp = xfs_buf_read(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, - XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert( - "xlog_recover_process_iunlinks(#2)", - log->l_mp, agibp, - XFS_AG_DADDR(mp, agno, - XFS_AGI_DADDR(mp))); - } + error = xfs_read_agi(mp, NULL, agno, &agibp); + ASSERT(error == 0); agi = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu( - agi->agi_magicnum)); } } @@ -3367,7 +3294,6 @@ xlog_pack_data( int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; xfs_caddr_t dp; - xlog_in_core_2_t *xhdr; xlog_pack_data_checksum(log, iclog, size); @@ -3382,7 +3308,8 @@ xlog_pack_data( } if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xhdr = (xlog_in_core_2_t *)&iclog->ic_header; + xlog_in_core_2_t *xhdr = iclog->ic_data; + for ( ; i < BTOBB(size); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3440,7 +3367,6 @@ xlog_unpack_data( xlog_t *log) { int i, j, k; - xlog_in_core_2_t *xhdr; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3449,7 +3375,7 @@ xlog_unpack_data( } if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xhdr = (xlog_in_core_2_t *)rhead; + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -4003,11 +3929,8 @@ xlog_recover_check_summary( { xfs_mount_t *mp; xfs_agf_t *agfp; - xfs_agi_t *agip; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_daddr_t agfdaddr; - xfs_daddr_t agidaddr; xfs_buf_t *sbbp; #ifdef XFS_LOUD_RECOVERY xfs_sb_t *sbp; @@ -4016,6 +3939,7 @@ xlog_recover_check_summary( __uint64_t freeblks; __uint64_t itotal; __uint64_t ifree; + int error; mp = log->l_mp; @@ -4023,37 +3947,27 @@ xlog_recover_check_summary( itotal = 0LL; ifree = 0LL; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); - agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agfbp)) { - xfs_ioerror_alert("xlog_recover_check_summary(agf)", - mp, agfbp, agfdaddr); - } - agfp = XFS_BUF_TO_AGF(agfbp); - ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); - ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); - ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - - agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); - agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert("xlog_recover_check_summary(agi)", - mp, agibp, agidaddr); + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_fs_cmn_err(CE_ALERT, mp, + "xlog_recover_check_summary(agf)" + "agf read failed agno %d error %d", + agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); } - agip = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum)); - ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum))); - ASSERT(be32_to_cpu(agip->agi_seqno) == agno); - itotal += be32_to_cpu(agip->agi_count); - ifree += be32_to_cpu(agip->agi_freecount); - xfs_buf_relse(agibp); + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (!error) { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); + } } sbbp = xfs_getsb(mp, 0); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 15f5dd22fbb2..3c97c6463a4e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags) STATIC void xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) { - int i; - mp->m_agfrotor = mp->m_agirotor = 0; spin_lock_init(&mp->m_agirotor_lock); mp->m_maxagi = mp->m_sb.sb_agcount; @@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - mp->m_litino = sbp->sb_inodesize - - ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t)); + mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode); mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; - INIT_LIST_HEAD(&mp->m_del_inodes); /* * Setup for attributes, in case they get created. @@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) } ASSERT(mp->m_attroffset < XFS_LITINO(mp)); - for (i = 0; i < 2; i++) { - mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_alloc, i == 0); - mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_alloc, i == 0); - } - for (i = 0; i < 2; i++) { - mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_bmbt, i == 0); - mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_bmbt, i == 0); - } - for (i = 0; i < 2; i++) { - mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_inobt, i == 0); - mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_inobt, i == 0); - } + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, @@ -1228,6 +1220,16 @@ xfs_unmountfs( __uint64_t resblks; int error; + /* + * Release dquot that rootinode, rbmino and rsumino might be holding, + * and release the quota inodes. + */ + XFS_QM_UNMOUNT(mp); + + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); IRELE(mp->m_rootip); /* @@ -1241,7 +1243,7 @@ xfs_unmountfs( * need to force the log first. */ xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); - xfs_iflush_all(mp); + xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC); XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); @@ -1288,11 +1290,6 @@ xfs_unmountfs( xfs_unmountfs_wait(mp); /* wait for async bufs */ xfs_log_unmount(mp); /* Done! No more fs ops. */ - /* - * All inodes from this mount point should be freed. - */ - ASSERT(mp->m_inodes == NULL); - if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) uuid_table_remove(&mp->m_sb.sb_uuid); @@ -1365,24 +1362,6 @@ xfs_log_sbcount( return error; } -STATIC void -xfs_mark_shared_ro( - xfs_mount_t *mp, - xfs_buf_t *bp) -{ - xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp); - __uint16_t version; - - if (!(sb->sb_flags & XFS_SBF_READONLY)) - sb->sb_flags |= XFS_SBF_READONLY; - - version = be16_to_cpu(sb->sb_versionnum); - if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 || - !(version & XFS_SB_VERSION_SHAREDBIT)) - version |= XFS_SB_VERSION_SHAREDBIT; - sb->sb_versionnum = cpu_to_be16(version); -} - int xfs_unmountfs_writesb(xfs_mount_t *mp) { @@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) sbp = xfs_getsb(mp, 0); - /* - * mark shared-readonly if desired - */ - if (mp->m_mk_sharedro) - xfs_mark_shared_ro(mp, sbp); - XFS_BUF_UNDONE(sbp); XFS_BUF_UNREAD(sbp); XFS_BUF_UNDELAYWRITE(sbp); @@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) if (error) xfs_ioerror_alert("xfs_unmountfs_writesb", mp, sbp, XFS_BUF_ADDR(sbp)); - if (error && mp->m_mk_sharedro) - xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); xfs_buf_relse(sbp); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f3c1024b1241..c1e028467327 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,7 +18,6 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ - typedef struct xfs_trans_reservations { uint tr_write; /* extent alloc trans */ uint tr_itruncate; /* truncate trans */ @@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations { } xfs_trans_reservations_t; #ifndef __KERNEL__ -/* - * Moved here from xfs_ag.h to avoid reordering header files - */ + #define XFS_DADDR_TO_AGNO(mp,d) \ ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) #define XFS_DADDR_TO_AGBNO(mp,d) \ ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) -#else + +#else /* __KERNEL__ */ + +#include "xfs_sync.h" + struct cred; struct log; struct xfs_mount_args; @@ -62,6 +63,7 @@ struct xfs_extdelta; struct xfs_swapext; struct xfs_mru_cache; struct xfs_nameops; +struct xfs_ail; /* * Prototypes and functions for the Data Migration subsystem. @@ -115,7 +117,7 @@ struct xfs_quotainfo; typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); -typedef int (*xfs_qmunmount_t)(struct xfs_mount *); +typedef void (*xfs_qmunmount_t)(struct xfs_mount *); typedef void (*xfs_qmdone_t)(struct xfs_mount *); typedef void (*xfs_dqrele_t)(struct xfs_dquot *); typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); @@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)( struct xfs_dquot **, struct xfs_dquot *); typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, uint); -typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); +typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); @@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) #endif -typedef struct xfs_ail { - struct list_head xa_ail; - uint xa_gen; - struct task_struct *xa_task; - xfs_lsn_t xa_target; -} xfs_ail_t; - typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ - spinlock_t m_ail_lock; /* fs AIL mutex */ - xfs_ail_t m_ail; /* fs active log item list */ + struct xfs_ail *m_ail; /* fs active log item list */ xfs_sb_t m_sb; /* copy of fs superblock */ spinlock_t m_sb_lock; /* sb counter lock */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ @@ -247,10 +241,6 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - struct xfs_inode *m_inodes; /* active inode list */ - struct list_head m_del_inodes; /* inodes to reclaim */ - mutex_t m_ilock; /* inode list mutex */ - uint m_ireclaims; /* count of calls to reclaim*/ uint m_readio_log; /* min read size log bytes */ uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ @@ -267,7 +257,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - __uint8_t m_dircook_elog; /* log d-cookie entry bits */ __uint8_t m_blkbit_log; /* blocklog + NBBY */ __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ @@ -276,12 +265,12 @@ typedef struct xfs_mount { uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ - uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ - uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ - uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ - uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ - uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ - uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ + uint m_alloc_mxr[2]; /* max alloc btree records */ + uint m_alloc_mnr[2]; /* min alloc btree records */ + uint m_bmap_dmxr[2]; /* max bmap btree records */ + uint m_bmap_dmnr[2]; /* min bmap btree records */ + uint m_inobt_mxr[2]; /* max inobt btree records */ + uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ @@ -312,9 +301,6 @@ typedef struct xfs_mount { int m_sinoalign; /* stripe unit inode alignment */ int m_attr_magicpct;/* 37% of the blocksize */ int m_dir_magicpct; /* 37% of the dir blocksize */ - __uint8_t m_mk_sharedro; /* mark shared ro on unmount */ - __uint8_t m_inode_quiesce;/* call quiesce on new inodes. - field governed by m_ilock */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ int m_dirblksize; /* directory block sz--bytes */ @@ -362,7 +348,6 @@ typedef struct xfs_mount { #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ #define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ #define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ /* osyncisdsync is now default*/ @@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define xfs_force_shutdown(m,f) \ xfs_do_force_shutdown(m, f, __FILE__, __LINE__) +#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ +#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ + +#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) +#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) + /* * Flags for xfs_mountfs */ @@ -508,14 +503,12 @@ typedef struct xfs_mod_sb { #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) -extern void xfs_mod_sb(xfs_trans_t *, __int64_t); extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); -extern int xfs_unmount_flush(xfs_mount_t *, int); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, int64_t, int); @@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern int xfs_fs_writable(xfs_mount_t *); -extern int xfs_syncsub(xfs_mount_t *, int, int *); -extern int xfs_sync_inodes(xfs_mount_t *, int, int *); -extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); -extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); -extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); +extern int xfs_dmops_get(struct xfs_mount *); extern void xfs_dmops_put(struct xfs_mount *); -extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); +extern int xfs_qmops_get(struct xfs_mount *); extern void xfs_qmops_put(struct xfs_mount *); extern struct xfs_dmops xfs_dmcore_xfs; #endif /* __KERNEL__ */ +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c index a294e58db8dd..27f80581520a 100644 --- a/fs/xfs/xfs_qmops.c +++ b/fs/xfs/xfs_qmops.c @@ -28,7 +28,6 @@ #include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_error.h" -#include "xfs_clnt.h" STATIC struct xfs_dquot * @@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = { }; int -xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) +xfs_qmops_get(struct xfs_mount *mp) { - if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { + if (XFS_IS_QUOTA_RUNNING(mp)) { #ifdef CONFIG_XFS_QUOTA mp->m_qm_ops = &xfs_qmcore_xfs; #else diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 12c4ec775af8..48965ecaa155 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -84,11 +84,9 @@ typedef struct xfs_dqblk { #define XFS_DQ_USER 0x0001 /* a user quota */ #define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ -#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */ -#define XFS_DQ_MARKER 0x0080 /* sentinel */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */ +#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */ #define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index c903130be7fd..86471bb40fd4 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -42,31 +42,6 @@ /* - * Given an array of up to 4 inode pointers, unlock the pointed to inodes. - * If there are fewer than 4 entries in the array, the empty entries will - * be at the end and will have NULL pointers in them. - */ -STATIC void -xfs_rename_unlock4( - xfs_inode_t **i_tab, - uint lock_mode) -{ - int i; - - xfs_iunlock(i_tab[0], lock_mode); - for (i = 1; i < 4; i++) { - if (i_tab[i] == NULL) - break; - - /* - * Watch out for duplicate entries in the table. - */ - if (i_tab[i] != i_tab[i-1]) - xfs_iunlock(i_tab[i], lock_mode); - } -} - -/* * Enter all inodes for a rename transaction into a sorted array. */ STATIC void @@ -205,19 +180,6 @@ xfs_rename( xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } - - /* * Join all the inodes to the transaction. From this point on, * we can rely on either trans_commit or trans_cancel to unlock * them. Note that we need to add a vnode reference to the @@ -242,6 +204,17 @@ xfs_rename( } /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + goto error_return; + } + + /* * Set up the target. */ if (target_ip == NULL) { @@ -367,19 +340,11 @@ xfs_rename( &first_block, &free_list, spaceres); if (error) goto abort_return; - xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - /* - * Update the generation counts on all the directory inodes - * that we're modifying. - */ - src_dp->i_gen++; + xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - - if (new_parent) { - target_dp->i_gen++; + if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - } /* * If this is a synchronous mount, make sure that the diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e2f68de16159..edf12c7b834c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -85,7 +85,6 @@ xfs_growfs_rt_alloc( { xfs_fileoff_t bno; /* block number in file */ xfs_buf_t *bp; /* temporary buffer for zeroing */ - int cancelflags; /* flags for xfs_trans_cancel */ int committed; /* transaction committed flag */ xfs_daddr_t d; /* disk block address */ int error; /* error return value */ @@ -96,15 +95,16 @@ xfs_growfs_rt_alloc( xfs_bmbt_irec_t map; /* block map output */ int nmap; /* number of block maps */ int resblks; /* space reservation */ - xfs_trans_t *tp; /* transaction pointer */ /* * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); - cancelflags = 0; /* * Reserve space & log for one extent added to the file. */ @@ -171,7 +171,9 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0); if (bp == NULL) { error = XFS_ERROR(EIO); - goto error_cancel; +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; } memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); @@ -188,8 +190,6 @@ xfs_growfs_rt_alloc( oblocks = map.br_startoff + map.br_blockcount; } return 0; -error_cancel: - xfs_trans_cancel(tp, cancelflags); error: return error; } @@ -1856,7 +1856,6 @@ xfs_growfs_rt( { xfs_rtblock_t bmbno; /* bitmap block number */ xfs_buf_t *bp; /* temporary buffer */ - int cancelflags; /* flags for xfs_trans_cancel */ int error; /* error return value */ xfs_inode_t *ip; /* bitmap inode, used as lock */ xfs_mount_t *nmp; /* new (fake) mount structure */ @@ -1872,13 +1871,13 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ xfs_fsblock_t sumbno; /* summary block number */ - xfs_trans_t *tp; /* transaction pointer */ sbp = &mp->m_sb; - cancelflags = 0; /* * Initial error checking. */ + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) @@ -1942,6 +1941,9 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { + xfs_trans_t *tp; + int cancelflags = 0; + *nmp = *mp; nsbp = &nmp->m_sb; /* @@ -1967,16 +1969,15 @@ xfs_growfs_rt( * Start a transaction, get the log reservation. */ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - cancelflags = 0; if ((error = xfs_trans_reserve(tp, 0, XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) - break; + goto error_cancel; /* * Lock out other callers by grabbing the bitmap inode lock. */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip))) - break; + goto error_cancel; ASSERT(ip == mp->m_rbmip); /* * Update the bitmap inode's size. @@ -1990,7 +1991,7 @@ xfs_growfs_rt( */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, XFS_ILOCK_EXCL, &ip))) - break; + goto error_cancel; ASSERT(ip == mp->m_rsumip); /* * Update the summary inode's size. @@ -2005,7 +2006,7 @@ xfs_growfs_rt( mp->m_rsumlevels != nmp->m_rsumlevels) { error = xfs_rtcopy_summary(mp, nmp, tp); if (error) - break; + goto error_cancel; } /* * Update superblock fields. @@ -2031,8 +2032,11 @@ xfs_growfs_rt( bp = NULL; error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); - if (error) + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); break; + } /* * Mark more blocks free in the superblock. */ @@ -2045,15 +2049,10 @@ xfs_growfs_rt( mp->m_rsumsize = nrsumsize; error = xfs_trans_commit(tp, 0); - if (error) { - tp = NULL; + if (error) break; - } } - if (error && tp) - xfs_trans_cancel(tp, cancelflags); - /* * Free the fake mp structure. */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index 3a82576dde9a..36f3a21c54d2 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -406,7 +406,7 @@ xfs_bwrite( * XXXsup how does this work for quotas. */ XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); - XFS_BUF_SET_FSPRIVATE3(bp, mp); + bp->b_mount = mp; XFS_BUF_WRITE(bp); if ((error = XFS_bwrite(bp))) { diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 3f8cf1587f4c..1ed71916e4c9 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -79,6 +79,7 @@ struct xfs_mount; #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ @@ -296,30 +297,34 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -#ifdef __KERNEL__ static inline int xfs_sb_good_version(xfs_sb_t *sbp) { - return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ - (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ - (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); -} + /* We always support version 1-3 */ + if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && + sbp->sb_versionnum <= XFS_SB_VERSION_3) + return 1; + + /* We support version 4 if all feature bits are supported */ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) + return 0; + +#ifdef __KERNEL__ + if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; #else -static inline int xfs_sb_good_version(xfs_sb_t *sbp) -{ - return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ - (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ - (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ - (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)))); + if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) && + sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) + return 0; +#endif + + return 1; + } + + return 0; } -#endif /* __KERNEL__ */ /* * Detect a mismatched features2 field. Older kernels read/wrote @@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) static inline unsigned xfs_sb_version_tonew(unsigned v) { - return ((((v) == XFS_SB_VERSION_1) ? \ - 0 : \ - (((v) == XFS_SB_VERSION_2) ? \ - XFS_SB_VERSION_ATTRBIT : \ - (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ - XFS_SB_VERSION_4); + if (v == XFS_SB_VERSION_1) + return XFS_SB_VERSION_4; + + if (v == XFS_SB_VERSION_2) + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + + return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | + XFS_SB_VERSION_NLINKBIT; } static inline unsigned xfs_sb_version_toold(unsigned v) { - return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ - 0 : \ - (((v) & XFS_SB_VERSION_NLINKBIT) ? \ - XFS_SB_VERSION_3 : \ - (((v) & XFS_SB_VERSION_ATTRBIT) ? \ - XFS_SB_VERSION_2 : \ - XFS_SB_VERSION_1))); + if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) + return 0; + if (v & XFS_SB_VERSION_NLINKBIT) + return XFS_SB_VERSION_3; + if (v & XFS_SB_VERSION_ATTRBIT) + return XFS_SB_VERSION_2; + return XFS_SB_VERSION_1; } static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) { - return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ - ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); + return sbp->sb_versionnum == XFS_SB_VERSION_2 || + sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); } static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) { - (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ - XFS_SB_VERSION_2 : \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ - (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); + if (sbp->sb_versionnum == XFS_SB_VERSION_1) + sbp->sb_versionnum = XFS_SB_VERSION_2; + else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; + else + sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; } static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) { - return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); + return sbp->sb_versionnum == XFS_SB_VERSION_3 || + (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); } static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) { - (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ - XFS_SB_VERSION_3 : \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); + if (sbp->sb_versionnum <= XFS_SB_VERSION_2) + sbp->sb_versionnum = XFS_SB_VERSION_3; + else + sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; } static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) { - (sbp)->sb_versionnum = \ - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ - (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ - XFS_SB_VERSION_QUOTABIT)); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; + else + sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | + XFS_SB_VERSION_QUOTABIT; } static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); } static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); } static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); } static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } /* @@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp) && \ - ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT); } static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) { - return (xfs_sb_version_hasmorebits(sbp)) && \ - ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); } static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) { - ((sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ - ((sbp)->sb_features2 = \ - ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; } static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 4e1c22a23be5..8570b826fedd 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -290,7 +290,7 @@ xfs_trans_dup( ASSERT(tp->t_ticket != NULL); ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); - ntp->t_ticket = tp->t_ticket; + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; @@ -1260,6 +1260,13 @@ xfs_trans_roll( trans = *tpp; /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(trans->t_ticket); + + + /* * Reserve space in the log for th next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed( xfs_log_item_desc_t *lidp; xfs_log_item_t *lip; xfs_lsn_t item_lsn; - struct xfs_mount *mp; int i; lidp = licp->lic_descs; for (i = 0; i < licp->lic_unused; i++, lidp++) { + struct xfs_ail *ailp; + if (xfs_lic_isfree(licp, i)) { continue; } @@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed( * This would cause the earlier transaction to fail * the test below. */ - mp = lip->li_mountp; - spin_lock(&mp->m_ail_lock); + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { /* * This will set the item's lsn to item_lsn * and update the position of the item in * the AIL. * - * xfs_trans_update_ail() drops the AIL lock. + * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_update_ail(mp, lip, item_lsn); + xfs_trans_ail_update(ailp, lip, item_lsn); } else { - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 74c80bd2b0ec..d6fe4a88d79f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -18,6 +18,8 @@ #ifndef __XFS_TRANS_H__ #define __XFS_TRANS_H__ +struct xfs_log_item; + /* * This is the structure written in the log at the head of * every transaction. It identifies the type and id of the @@ -98,76 +100,6 @@ typedef struct xfs_trans_header { #define XFS_TRANS_TYPE_MAX 41 /* new transaction types need to be reflected in xfs_logprint(8) */ - -#ifdef __KERNEL__ -struct xfs_buf; -struct xfs_buftarg; -struct xfs_efd_log_item; -struct xfs_efi_log_item; -struct xfs_inode; -struct xfs_item_ops; -struct xfs_log_iovec; -struct xfs_log_item; -struct xfs_log_item_desc; -struct xfs_mount; -struct xfs_trans; -struct xfs_dquot_acct; - -typedef struct xfs_log_item { - struct list_head li_ail; /* AIL pointers */ - xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ - struct xfs_mount *li_mountp; /* ptr to fs mount */ - uint li_type; /* item type */ - uint li_flags; /* misc flags */ - struct xfs_log_item *li_bio_list; /* buffer item list */ - void (*li_cb)(struct xfs_buf *, - struct xfs_log_item *); - /* buffer item iodone */ - /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ -} xfs_log_item_t; - -#define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 - -typedef struct xfs_item_ops { - uint (*iop_size)(xfs_log_item_t *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); - void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); - void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); - uint (*iop_trylock)(xfs_log_item_t *); - void (*iop_unlock)(xfs_log_item_t *); - xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_push)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); - void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; - -#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) -#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) -#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) -#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) -#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) -#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) -#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) - -/* - * Return values for the IOP_TRYLOCK() routines. - */ -#define XFS_ITEM_SUCCESS 0 -#define XFS_ITEM_PINNED 1 -#define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_FLUSHING 3 -#define XFS_ITEM_PUSHBUF 4 - -#endif /* __KERNEL__ */ - /* * This structure is used to track log items associated with * a transaction. It points to the log item and keeps some @@ -176,7 +108,7 @@ typedef struct xfs_item_ops { * once we get to commit processing (see xfs_trans_commit()). */ typedef struct xfs_log_item_desc { - xfs_log_item_t *lid_item; + struct xfs_log_item *lid_item; ushort lid_size; unsigned char lid_flags; unsigned char lid_index; @@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); } -#ifdef __KERNEL__ -/* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* - * This is the type of function which can be given to xfs_trans_callback() - * to be called upon the transaction's commit to disk. - */ -typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); - -/* - * This is the structure maintained for every active transaction. - */ -typedef struct xfs_trans { - unsigned int t_magic; /* magic number */ - xfs_log_callback_t t_logcb; /* log callback struct */ - unsigned int t_type; /* transaction type */ - unsigned int t_log_res; /* amt of log space resvd */ - unsigned int t_log_count; /* count for perm log res */ - unsigned int t_blk_res; /* # of blocks resvd */ - unsigned int t_blk_res_used; /* # of resvd blocks used */ - unsigned int t_rtx_res; /* # of rt extents resvd */ - unsigned int t_rtx_res_used; /* # of resvd rt extents used */ - xfs_log_ticket_t t_ticket; /* log mgr ticket */ - xfs_lsn_t t_lsn; /* log seq num of start of - * transaction. */ - xfs_lsn_t t_commit_lsn; /* log seq num of end of - * transaction. */ - struct xfs_mount *t_mountp; /* ptr to fs mount struct */ - struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ - xfs_trans_callback_t t_callback; /* transaction callback */ - void *t_callarg; /* callback arg */ - unsigned int t_flags; /* misc flags */ - int64_t t_icount_delta; /* superblock icount change */ - int64_t t_ifree_delta; /* superblock ifree change */ - int64_t t_fdblocks_delta; /* superblock fdblocks chg */ - int64_t t_res_fdblocks_delta; /* on-disk only chg */ - int64_t t_frextents_delta;/* superblock freextents chg*/ - int64_t t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG - int64_t t_ag_freeblks_delta; /* debugging counter */ - int64_t t_ag_flist_delta; /* debugging counter */ - int64_t t_ag_btree_delta; /* debugging counter */ -#endif - int64_t t_dblocks_delta;/* superblock dblocks change */ - int64_t t_agcount_delta;/* superblock agcount change */ - int64_t t_imaxpct_delta;/* superblock imaxpct change */ - int64_t t_rextsize_delta;/* superblock rextsize chg */ - int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ - int64_t t_rblocks_delta;/* superblock rblocks change */ - int64_t t_rextents_delta;/* superblocks rextents chg */ - int64_t t_rextslog_delta;/* superblocks rextslog chg */ - unsigned int t_items_free; /* log item descs free */ - xfs_log_item_chunk_t t_items; /* first log item desc chunk */ - xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ - unsigned long t_pflags; /* saved process flags state */ -} xfs_trans_t; - -#endif /* __KERNEL__ */ - - #define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ /* * Values for t_flags. @@ -906,6 +750,157 @@ typedef struct xfs_trans { #define XFS_DQUOT_REF 1 #ifdef __KERNEL__ + +struct xfs_buf; +struct xfs_buftarg; +struct xfs_efd_log_item; +struct xfs_efi_log_item; +struct xfs_inode; +struct xfs_item_ops; +struct xfs_log_iovec; +struct xfs_log_item_desc; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot_acct; + +typedef struct xfs_log_item { + struct list_head li_ail; /* AIL pointers */ + xfs_lsn_t li_lsn; /* last on-disk lsn */ + struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ + struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xfs_ail *li_ailp; /* ptr to AIL */ + uint li_type; /* item type */ + uint li_flags; /* misc flags */ + struct xfs_log_item *li_bio_list; /* buffer item list */ + void (*li_cb)(struct xfs_buf *, + struct xfs_log_item *); + /* buffer item iodone */ + /* callback func */ + struct xfs_item_ops *li_ops; /* function list */ +} xfs_log_item_t; + +#define XFS_LI_IN_AIL 0x1 +#define XFS_LI_ABORTED 0x2 + +typedef struct xfs_item_ops { + uint (*iop_size)(xfs_log_item_t *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_pin)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); + uint (*iop_trylock)(xfs_log_item_t *); + void (*iop_unlock)(xfs_log_item_t *); + xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_push)(xfs_log_item_t *); + void (*iop_pushbuf)(xfs_log_item_t *); + void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); +} xfs_item_ops_t; + +#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) +#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) +#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) +#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) +#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) +#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) +#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) +#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) +#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) +#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) + +/* + * Return values for the IOP_TRYLOCK() routines. + */ +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 +#define XFS_ITEM_PUSHBUF 4 + +/* + * This structure is used to maintain a list of block ranges that have been + * freed in the transaction. The ranges are listed in the perag[] busy list + * between when they're freed and the transaction is committed to disk. + */ + +typedef struct xfs_log_busy_slot { + xfs_agnumber_t lbc_ag; + ushort lbc_idx; /* index in perag.busy[] */ +} xfs_log_busy_slot_t; + +#define XFS_LBC_NUM_SLOTS 31 +typedef struct xfs_log_busy_chunk { + struct xfs_log_busy_chunk *lbc_next; + uint lbc_free; /* free slots bitmask */ + ushort lbc_unused; /* first unused */ + xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; +} xfs_log_busy_chunk_t; + +#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) +#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) + +#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) +#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) +#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) +#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) +#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) + +/* + * This is the type of function which can be given to xfs_trans_callback() + * to be called upon the transaction's commit to disk. + */ +typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); + +/* + * This is the structure maintained for every active transaction. + */ +typedef struct xfs_trans { + unsigned int t_magic; /* magic number */ + xfs_log_callback_t t_logcb; /* log callback struct */ + unsigned int t_type; /* transaction type */ + unsigned int t_log_res; /* amt of log space resvd */ + unsigned int t_log_count; /* count for perm log res */ + unsigned int t_blk_res; /* # of blocks resvd */ + unsigned int t_blk_res_used; /* # of resvd blocks used */ + unsigned int t_rtx_res; /* # of rt extents resvd */ + unsigned int t_rtx_res_used; /* # of resvd rt extents used */ + xfs_log_ticket_t t_ticket; /* log mgr ticket */ + xfs_lsn_t t_lsn; /* log seq num of start of + * transaction. */ + xfs_lsn_t t_commit_lsn; /* log seq num of end of + * transaction. */ + struct xfs_mount *t_mountp; /* ptr to fs mount struct */ + struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + xfs_trans_callback_t t_callback; /* transaction callback */ + void *t_callarg; /* callback arg */ + unsigned int t_flags; /* misc flags */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ +#ifdef DEBUG + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ +#endif + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ + unsigned int t_items_free; /* log item descs free */ + xfs_log_item_chunk_t t_items; /* first log item desc chunk */ + xfs_trans_header_t t_header; /* header for in-log trans */ + unsigned int t_busy_free; /* busy descs free */ + xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + unsigned long t_pflags; /* saved process flags state */ +} xfs_trans_t; + /* * XFS transaction mechanism exported interfaces that are * actually macros. @@ -928,7 +923,6 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. */ -void xfs_trans_init(struct xfs_mount *); xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); @@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *, int *); #define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) void xfs_trans_cancel(xfs_trans_t *, int); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); -xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *); -void xfs_trans_unlocked_item(struct xfs_mount *, - xfs_log_item_t *); xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx); @@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone; #endif /* __KERNEL__ */ +void xfs_trans_init(struct xfs_mount *); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1f77c00af566..2d47f10f8bed 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -28,13 +29,13 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); -STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); +STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *); +STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *); +STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *); #ifdef DEBUG -STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); +STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); #else #define xfs_ail_check(a,l) #endif /* DEBUG */ @@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); * lsn of the last item in the AIL. */ xfs_lsn_t -xfs_trans_tail_ail( - xfs_mount_t *mp) +xfs_trans_ail_tail( + struct xfs_ail *ailp) { xfs_lsn_t lsn; xfs_log_item_t *lip; - spin_lock(&mp->m_ail_lock); - lip = xfs_ail_min(&mp->m_ail); + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); if (lip == NULL) { lsn = (xfs_lsn_t)0; } else { lsn = lip->li_lsn; } - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); return lsn; } @@ -85,16 +86,125 @@ xfs_trans_tail_ail( * any of the objects, so the lock is not needed. */ void -xfs_trans_push_ail( - xfs_mount_t *mp, - xfs_lsn_t threshold_lsn) +xfs_trans_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) { - xfs_log_item_t *lip; + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { + if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) + xfsaild_wakeup(ailp, threshold_lsn); + } +} + +/* + * AIL traversal cursor initialisation. + * + * The cursor keeps track of where our current traversal is up + * to by tracking the next ƣtem in the list for us. However, for + * this to be safe, removing an object from the AIL needs to invalidate + * any cursor that points to it. hence the traversal cursor needs to + * be linked to the struct xfs_ail so that deletion can search all the + * active cursors for invalidation. + * + * We don't link the push cursor because it is embedded in the struct + * xfs_ail and hence easily findable. + */ +STATIC void +xfs_trans_ail_cursor_init( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + if (cur == &ailp->xa_cursors) + return; + + cur->next = ailp->xa_cursors.next; + ailp->xa_cursors.next = cur; +} + +/* + * Set the cursor to the next item, because when we look + * up the cursor the current item may have been freed. + */ +STATIC void +xfs_trans_ail_cursor_set( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item *lip) +{ + if (lip) + cur->item = xfs_ail_next(ailp, lip); +} + +/* + * Get the next item in the traversal and advance the cursor. + * If the cursor was invalidated (inidicated by a lip of 1), + * restart the traversal. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_next( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + struct xfs_log_item *lip = cur->item; + + if ((__psint_t)lip & 1) + lip = xfs_ail_min(ailp); + xfs_trans_ail_cursor_set(ailp, cur, lip); + return lip; +} + +/* + * Now that the traversal is complete, we need to remove the cursor + * from the list of traversing cursors. Avoid removing the embedded + * push cursor, but use the fact it is alway present to make the + * list deletion simple. + */ +void +xfs_trans_ail_cursor_done( + struct xfs_ail *ailp, + struct xfs_ail_cursor *done) +{ + struct xfs_ail_cursor *prev = NULL; + struct xfs_ail_cursor *cur; + + done->item = NULL; + if (done == &ailp->xa_cursors) + return; + prev = &ailp->xa_cursors; + for (cur = prev->next; cur; prev = cur, cur = prev->next) { + if (cur == done) { + prev->next = cur->next; + break; + } + } + ASSERT(cur); +} + +/* + * Invalidate any cursor that is pointing to this item. This is + * called when an item is removed from the AIL. Any cursor pointing + * to this object is now invalid and the traversal needs to be + * terminated so it doesn't reference a freed object. We set the + * cursor item to a value of 1 so we can distinguish between an + * invalidation and the end of the list when getting the next item + * from the cursor. + */ +STATIC void +xfs_trans_ail_cursor_clear( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_ail_cursor *cur; - lip = xfs_ail_min(&mp->m_ail); - if (lip && !XFS_FORCED_SHUTDOWN(mp)) { - if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) - xfsaild_wakeup(mp, threshold_lsn); + /* need to search all cursors */ + for (cur = &ailp->xa_cursors; cur; cur = cur->next) { + if (cur->item == lip) + cur->item = (struct xfs_log_item *) + ((__psint_t)cur->item | 1); } } @@ -103,25 +213,27 @@ xfs_trans_push_ail( * Return the current tree generation number for use * in calls to xfs_trans_next_ail(). */ -STATIC xfs_log_item_t * -xfs_trans_first_push_ail( - xfs_mount_t *mp, - int *gen, - xfs_lsn_t lsn) +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) { - xfs_log_item_t *lip; + xfs_log_item_t *lip; - lip = xfs_ail_min(&mp->m_ail); - *gen = (int)mp->m_ail.xa_gen; + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_ail_min(ailp); if (lsn == 0) - return lip; + goto out; - list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) - return lip; + goto out; } - - return NULL; + lip = NULL; +out: + xfs_trans_ail_cursor_set(ailp, cur, lip); + return lip; } /* @@ -129,29 +241,29 @@ xfs_trans_first_push_ail( */ long xfsaild_push( - xfs_mount_t *mp, + struct xfs_ail *ailp, xfs_lsn_t *last_lsn) { long tout = 1000; /* milliseconds */ xfs_lsn_t last_pushed_lsn = *last_lsn; - xfs_lsn_t target = mp->m_ail.xa_target; + xfs_lsn_t target = ailp->xa_target; xfs_lsn_t lsn; xfs_log_item_t *lip; - int gen; - int restarts; int flush_log, count, stuck; + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor *cur = &ailp->xa_cursors; -#define XFS_TRANS_PUSH_AIL_RESTARTS 10 - - spin_lock(&mp->m_ail_lock); - lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn); + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_init(ailp, cur); + lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); if (!lip || XFS_FORCED_SHUTDOWN(mp)) { /* * AIL is empty or our push has reached the end. */ - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); last_pushed_lsn = 0; - goto out; + return tout; } XFS_STATS_INC(xs_push_ail); @@ -169,7 +281,7 @@ xfsaild_push( */ tout = 10; lsn = lip->li_lsn; - flush_log = stuck = count = restarts = 0; + flush_log = stuck = count = 0; while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { int lock_result; /* @@ -184,7 +296,7 @@ xfsaild_push( * skip to the next item in the list. */ lock_result = IOP_TRYLOCK(lip); - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(xs_push_ail_success); @@ -221,7 +333,7 @@ xfsaild_push( break; } - spin_lock(&mp->m_ail_lock); + spin_lock(&ailp->xa_lock); /* should we bother continuing? */ if (XFS_FORCED_SHUTDOWN(mp)) break; @@ -244,14 +356,13 @@ xfsaild_push( if (stuck > 100) break; - lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); + lip = xfs_trans_ail_cursor_next(ailp, cur); if (lip == NULL) break; - if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS) - break; lsn = lip->li_lsn; } - spin_unlock(&mp->m_ail_lock); + xfs_trans_ail_cursor_done(ailp, cur); + spin_unlock(&ailp->xa_lock); if (flush_log) { /* @@ -274,8 +385,7 @@ xfsaild_push( */ tout += 20; last_pushed_lsn = 0; - } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || - ((stuck * 100) / count > 90)) { + } else if ((stuck * 100) / count > 90) { /* * Either there is a lot of contention on the AIL or we * are stuck due to operations in progress. "Stuck" in this @@ -287,7 +397,6 @@ xfsaild_push( */ tout += 10; } -out: *last_lsn = last_pushed_lsn; return tout; } /* xfsaild_push */ @@ -303,7 +412,7 @@ out: */ void xfs_trans_unlocked_item( - xfs_mount_t *mp, + struct xfs_ail *ailp, xfs_log_item_t *lip) { xfs_log_item_t *min_lip; @@ -315,7 +424,7 @@ xfs_trans_unlocked_item( * over some potentially valid data. */ if (!(lip->li_flags & XFS_LI_IN_AIL) || - XFS_FORCED_SHUTDOWN(mp)) { + XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { return; } @@ -331,10 +440,10 @@ xfs_trans_unlocked_item( * the call to xfs_log_move_tail() doesn't do anything if there's * not enough free space to wake people up so we're safe calling it. */ - min_lip = xfs_ail_min(&mp->m_ail); + min_lip = xfs_ail_min(ailp); if (min_lip == lip) - xfs_log_move_tail(mp, 1); + xfs_log_move_tail(ailp->xa_mount, 1); } /* xfs_trans_unlocked_item */ @@ -347,41 +456,37 @@ xfs_trans_unlocked_item( * we move in the AIL is the minimum one, update the tail lsn in the * log manager. * - * Increment the AIL's generation count to indicate that the tree - * has changed. - * * This function must be called with the AIL lock held. The lock * is dropped before returning. */ void -xfs_trans_update_ail( - xfs_mount_t *mp, +xfs_trans_ail_update( + struct xfs_ail *ailp, xfs_log_item_t *lip, - xfs_lsn_t lsn) __releases(mp->m_ail_lock) + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_log_item_t *dlip=NULL; + xfs_log_item_t *dlip = NULL; xfs_log_item_t *mlip; /* ptr to minimum lip */ - mlip = xfs_ail_min(&mp->m_ail); + mlip = xfs_ail_min(ailp); if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(&mp->m_ail, lip); + dlip = xfs_ail_delete(ailp, lip); ASSERT(dlip == lip); + xfs_trans_ail_cursor_clear(ailp, dlip); } else { lip->li_flags |= XFS_LI_IN_AIL; } lip->li_lsn = lsn; - - xfs_ail_insert(&mp->m_ail, lip); - mp->m_ail.xa_gen++; + xfs_ail_insert(ailp, lip); if (mlip == dlip) { - mlip = xfs_ail_min(&mp->m_ail); - spin_unlock(&mp->m_ail_lock); - xfs_log_move_tail(mp, mlip->li_lsn); + mlip = xfs_ail_min(ailp); + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn); } else { - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } @@ -403,29 +508,30 @@ xfs_trans_update_ail( * is dropped before returning. */ void -xfs_trans_delete_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip) __releases(mp->m_ail_lock) +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) __releases(ailp->xa_lock) { xfs_log_item_t *dlip; xfs_log_item_t *mlip; if (lip->li_flags & XFS_LI_IN_AIL) { - mlip = xfs_ail_min(&mp->m_ail); - dlip = xfs_ail_delete(&mp->m_ail, lip); + mlip = xfs_ail_min(ailp); + dlip = xfs_ail_delete(ailp, lip); ASSERT(dlip == lip); + xfs_trans_ail_cursor_clear(ailp, dlip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; - mp->m_ail.xa_gen++; if (mlip == dlip) { - mlip = xfs_ail_min(&mp->m_ail); - spin_unlock(&mp->m_ail_lock); - xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); + mlip = xfs_ail_min(ailp); + spin_unlock(&ailp->xa_lock); + xfs_log_move_tail(ailp->xa_mount, + (mlip ? mlip->li_lsn : 0)); } else { - spin_unlock(&mp->m_ail_lock); + spin_unlock(&ailp->xa_lock); } } else { @@ -433,13 +539,13 @@ xfs_trans_delete_ail( * If the file system is not being shutdown, we are in * serious trouble if we get to this stage. */ - if (XFS_FORCED_SHUTDOWN(mp)) - spin_unlock(&mp->m_ail_lock); - else { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, "%s: attempting to delete a log item that is not in the AIL", __func__); - spin_unlock(&mp->m_ail_lock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } } @@ -448,56 +554,6 @@ xfs_trans_delete_ail( /* - * Return the item in the AIL with the smallest lsn. - * Return the current tree generation number for use - * in calls to xfs_trans_next_ail(). - */ -xfs_log_item_t * -xfs_trans_first_ail( - xfs_mount_t *mp, - int *gen) -{ - xfs_log_item_t *lip; - - lip = xfs_ail_min(&mp->m_ail); - *gen = (int)mp->m_ail.xa_gen; - - return lip; -} - -/* - * If the generation count of the tree has not changed since the - * caller last took something from the AIL, then return the elmt - * in the tree which follows the one given. If the count has changed, - * then return the minimum elmt of the AIL and bump the restarts counter - * if one is given. - */ -xfs_log_item_t * -xfs_trans_next_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - int *gen, - int *restarts) -{ - xfs_log_item_t *nlip; - - ASSERT(mp && lip && gen); - if (mp->m_ail.xa_gen == *gen) { - nlip = xfs_ail_next(&mp->m_ail, lip); - } else { - nlip = xfs_ail_min(&mp->m_ail); - *gen = (int)mp->m_ail.xa_gen; - if (restarts != NULL) { - XFS_STATS_INC(xs_push_ail_restarts); - (*restarts)++; - } - } - - return (nlip); -} - - -/* * The active item list (AIL) is a doubly linked list of log * items sorted by ascending lsn. The base of the list is * a forw/back pointer pair embedded in the xfs mount structure. @@ -515,15 +571,35 @@ int xfs_trans_ail_init( xfs_mount_t *mp) { - INIT_LIST_HEAD(&mp->m_ail.xa_ail); - return xfsaild_start(mp); + struct xfs_ail *ailp; + int error; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + spin_lock_init(&ailp->xa_lock); + error = xfsaild_start(ailp); + if (error) + goto out_free_ailp; + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return error; } void xfs_trans_ail_destroy( xfs_mount_t *mp) { - xfsaild_stop(mp); + struct xfs_ail *ailp = mp->m_ail; + + xfsaild_stop(ailp); + kmem_free(ailp); } /* @@ -534,7 +610,7 @@ xfs_trans_ail_destroy( */ STATIC void xfs_ail_insert( - xfs_ail_t *ailp, + struct xfs_ail *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { @@ -568,7 +644,7 @@ xfs_ail_insert( /*ARGSUSED*/ STATIC xfs_log_item_t * xfs_ail_delete( - xfs_ail_t *ailp, + struct xfs_ail *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { @@ -585,7 +661,7 @@ xfs_ail_delete( */ STATIC xfs_log_item_t * xfs_ail_min( - xfs_ail_t *ailp) + struct xfs_ail *ailp) /* ARGSUSED */ { if (list_empty(&ailp->xa_ail)) @@ -601,7 +677,7 @@ xfs_ail_min( */ STATIC xfs_log_item_t * xfs_ail_next( - xfs_ail_t *ailp, + struct xfs_ail *ailp, xfs_log_item_t *lip) /* ARGSUSED */ { @@ -617,7 +693,7 @@ xfs_ail_next( */ STATIC void xfs_ail_check( - xfs_ail_t *ailp, + struct xfs_ail *ailp, xfs_log_item_t *lip) { xfs_log_item_t *prev_lip; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4e855b5ced66..8ee2f8c8b0a6 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp, lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); if (lip->li_type == XFS_LI_BUF) { bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); - xfs_trans_unlocked_item( - bip->bli_item.li_mountp, - lip); + xfs_trans_unlocked_item(bip->bli_item.li_ailp, + lip); } } xfs_buf_relse(bp); @@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp, * tell the AIL that the buffer is being unlocked. */ if (bip != NULL) { - xfs_trans_unlocked_item(bip->bli_item.li_mountp, + xfs_trans_unlocked_item(bip->bli_item.li_ailp, (xfs_log_item_t*)bip); } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 2a1c0f071f91..23d276af2e0c 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -85,7 +85,6 @@ xfs_trans_iget( { int error; xfs_inode_t *ip; - xfs_inode_log_item_t *iip; /* * If the transaction pointer is NULL, just call the normal @@ -138,34 +137,7 @@ xfs_trans_iget( } ASSERT(ip != NULL); - /* - * Get a log_item_desc to point at the new item. - */ - if (ip->i_itemp == NULL) - xfs_inode_item_init(ip, mp); - iip = ip->i_itemp; - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip)); - - xfs_trans_inode_broot_debug(ip); - - /* - * If the IO lock has been acquired, mark that in - * the inode log item so we'll know to unlock it - * when the transaction commits. - */ - ASSERT(iip->ili_flags == 0); - if (lock_flags & XFS_IOLOCK_EXCL) { - iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; - } - - /* - * Initialize i_transp so we can find it with xfs_inode_incore() - * above. - */ - ip->i_transp = tp; - + xfs_trans_ijoin(tp, ip, lock_flags); *ipp = ip; return 0; } diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index 3c666e8317f8..e110bf57d7f4 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -22,6 +22,14 @@ #include "xfs_inum.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" +/* XXX: from here down needed until struct xfs_trans has it's own ailp */ +#include "xfs_bit.h" +#include "xfs_buf_item.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, int, int, xfs_lsn_t); @@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) lidp->lid_size = 0; lip->li_desc = lidp; lip->li_mountp = tp->t_mountp; + lip->li_ailp = tp->t_mountp->m_ail; return lidp; } @@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) lidp->lid_size = 0; lip->li_desc = lidp; lip->li_mountp = tp->t_mountp; + lip->li_ailp = tp->t_mountp->m_ail; return lidp; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 3c748c456ed4..73e2ad397432 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, xfs_extlen_t idx); /* - * From xfs_trans_ail.c + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. */ -void xfs_trans_update_ail(struct xfs_mount *mp, - struct xfs_log_item *lip, xfs_lsn_t lsn) - __releases(mp->m_ail_lock); -void xfs_trans_delete_ail(struct xfs_mount *mp, - struct xfs_log_item *lip) - __releases(mp->m_ail_lock); -struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); -struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, - struct xfs_log_item *, int *, int *); +struct xfs_ail_cursor { + struct xfs_ail_cursor *next; + struct xfs_log_item *item; +}; +/* + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. + */ +struct xfs_ail { + struct xfs_mount *xa_mount; + struct list_head xa_ail; + uint xa_gen; + struct task_struct *xa_task; + xfs_lsn_t xa_target; + struct xfs_ail_cursor xa_cursors; + spinlock_t xa_lock; +}; /* - * AIL push thread support + * From xfs_trans_ail.c */ -long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); -void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); -int xfsaild_start(struct xfs_mount *); -void xfsaild_stop(struct xfs_mount *); +void xfs_trans_ail_update(struct xfs_ail *ailp, + struct xfs_log_item *lip, xfs_lsn_t lsn) + __releases(ailp->xa_lock); +void xfs_trans_ail_delete(struct xfs_ail *ailp, + struct xfs_log_item *lip) + __releases(ailp->xa_lock); +void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_trans_unlocked_item(struct xfs_ail *, + xfs_log_item_t *); + +xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp); + +struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); + +long xfsaild_push(struct xfs_ail *, xfs_lsn_t *); +void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t); +int xfsaild_start(struct xfs_ail *); +void xfsaild_stop(struct xfs_ail *); +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 35d4d414bcc2..fcc2285d03ed 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -172,6 +172,12 @@ xfs_dir_ialloc( *ipp = NULL; return code; } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); code = xfs_trans_reserve(tp, 0, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); /* @@ -268,9 +274,9 @@ xfs_bump_ino_vers2( xfs_mount_t *mp; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); + ASSERT(ip->i_d.di_version == 1); - ip->i_d.di_version = XFS_DINODE_VERSION_2; + ip->i_d.di_version = 2; ip->i_d.di_onlink = 0; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); mp = tp->t_mountp; @@ -302,7 +308,7 @@ xfs_bumplink( ASSERT(ip->i_d.di_nlink > 0); ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); - if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && + if ((ip->i_d.di_version == 1) && (ip->i_d.di_nlink > XFS_MAXLINK_1)) { /* * The inode has increased its number of links beyond diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c deleted file mode 100644 index 439dd3939dda..000000000000 --- a/fs/xfs/xfs_vfsops.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_error.h" -#include "xfs_bmap.h" -#include "xfs_rw.h" -#include "xfs_buf_item.h" -#include "xfs_log_priv.h" -#include "xfs_dir2_trace.h" -#include "xfs_extfree_item.h" -#include "xfs_acl.h" -#include "xfs_attr.h" -#include "xfs_clnt.h" -#include "xfs_mru_cache.h" -#include "xfs_filestream.h" -#include "xfs_fsops.h" -#include "xfs_vnodeops.h" -#include "xfs_vfsops.h" -#include "xfs_utils.h" - - -STATIC void -xfs_quiesce_fs( - xfs_mount_t *mp) -{ - int count = 0, pincount; - - xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_finish_reclaim_all(mp, 0); - - /* This loop must run at least twice. - * The first instance of the loop will flush - * most meta data but that will generate more - * meta data (typically directory updates). - * Which then must be flushed and logged before - * we can write the unmount record. - */ - do { - xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); -} - -/* - * Second stage of a quiesce. The data is already synced, now we have to take - * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceding. - */ -void -xfs_attr_quiesce( - xfs_mount_t *mp) -{ - int error = 0; - - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* flush inodes and push all remaining buffers out to disk */ - xfs_quiesce_fs(mp); - - ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp, 1); - if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); -} - -/* - * xfs_unmount_flush implements a set of flush operation on special - * inodes, which are needed as a separate set of operations so that - * they can be called as part of relocation process. - */ -int -xfs_unmount_flush( - xfs_mount_t *mp, /* Mount structure we are getting - rid of. */ - int relocation) /* Called from vfs relocation. */ -{ - xfs_inode_t *rip = mp->m_rootip; - xfs_inode_t *rbmip; - xfs_inode_t *rsumip = NULL; - int error; - - xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - xfs_iflock(rip); - - /* - * Flush out the real time inodes. - */ - if ((rbmip = mp->m_rbmip) != NULL) { - xfs_ilock(rbmip, XFS_ILOCK_EXCL); - xfs_iflock(rbmip); - error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC); - xfs_iunlock(rbmip, XFS_ILOCK_EXCL); - - if (error == EFSCORRUPTED) - goto fscorrupt_out; - - ASSERT(vn_count(VFS_I(rbmip)) == 1); - - rsumip = mp->m_rsumip; - xfs_ilock(rsumip, XFS_ILOCK_EXCL); - xfs_iflock(rsumip); - error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC); - xfs_iunlock(rsumip, XFS_ILOCK_EXCL); - - if (error == EFSCORRUPTED) - goto fscorrupt_out; - - ASSERT(vn_count(VFS_I(rsumip)) == 1); - } - - /* - * Synchronously flush root inode to disk - */ - error = xfs_iflush(rip, XFS_IFLUSH_SYNC); - if (error == EFSCORRUPTED) - goto fscorrupt_out2; - - if (vn_count(VFS_I(rip)) != 1 && !relocation) { - xfs_iunlock(rip, XFS_ILOCK_EXCL); - return XFS_ERROR(EBUSY); - } - - /* - * Release dquot that rootinode, rbmino and rsumino might be holding, - * flush and purge the quota inodes. - */ - error = XFS_QM_UNMOUNT(mp); - if (error == EFSCORRUPTED) - goto fscorrupt_out2; - - if (rbmip) { - IRELE(rbmip); - IRELE(rsumip); - } - - xfs_iunlock(rip, XFS_ILOCK_EXCL); - return 0; - -fscorrupt_out: - xfs_ifunlock(rip); - -fscorrupt_out2: - xfs_iunlock(rip, XFS_ILOCK_EXCL); - - return XFS_ERROR(EFSCORRUPTED); -} - -/* - * xfs_sync flushes any pending I/O to file system vfsp. - * - * This routine is called by vfs_sync() to make sure that things make it - * out to disk eventually, on sync() system calls to flush out everything, - * and when the file system is unmounted. For the vfs_sync() case, all - * we really need to do is sync out the log to make all of our meta-data - * updates permanent (except for timestamps). For calls from pflushd(), - * dirty pages are kept moving by calling pdflush() on the inodes - * containing them. We also flush the inodes that we can lock without - * sleeping and the superblock if we can lock it without sleeping from - * vfs_sync() so that items at the tail of the log are always moving out. - * - * Flags: - * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want - * to sleep if we can help it. All we really need - * to do is ensure that the log is synced at least - * periodically. We also push the inodes and - * superblock if we can lock them without sleeping - * and they are not pinned. - * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not - * set, then we really want to lock each inode and flush - * it. - * SYNC_WAIT - All the flushes that take place in this call should - * be synchronous. - * SYNC_DELWRI - This tells us to push dirty pages associated with - * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to - * determine if they should be flushed sync, async, or - * delwri. - * SYNC_CLOSE - This flag is passed when the system is being - * unmounted. We should sync and invalidate everything. - * SYNC_FSDATA - This indicates that the caller would like to make - * sure the superblock is safe on disk. We can ensure - * this by simply making sure the log gets flushed - * if SYNC_BDFLUSH is set, and by actually writing it - * out otherwise. - * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete - * before we return (including direct I/O). Forms the drain - * side of the write barrier needed to safely quiesce the - * filesystem. - * - */ -int -xfs_sync( - xfs_mount_t *mp, - int flags) -{ - int error; - - /* - * Get the Quota Manager to flush the dquots. - * - * If XFS quota support is not enabled or this filesystem - * instance does not use quotas XFS_QM_DQSYNC will always - * return zero. - */ - error = XFS_QM_DQSYNC(mp, flags); - if (error) { - /* - * If we got an IO error, we will be shutting down. - * So, there's nothing more for us to do here. - */ - ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp)); - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(error); - } - - if (flags & SYNC_IOWAIT) - xfs_filestream_flush(mp); - - return xfs_syncsub(mp, flags, NULL); -} - -/* - * xfs sync routine for internal use - * - * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. - * - */ -int -xfs_sync_inodes( - xfs_mount_t *mp, - int flags, - int *bypassed) -{ - xfs_inode_t *ip = NULL; - struct inode *vp = NULL; - int error; - int last_error; - uint64_t fflag; - uint lock_flags; - uint base_lock_flags; - boolean_t mount_locked; - boolean_t vnode_refed; - int preempt; - xfs_iptr_t *ipointer; -#ifdef DEBUG - boolean_t ipointer_in = B_FALSE; - -#define IPOINTER_SET ipointer_in = B_TRUE -#define IPOINTER_CLR ipointer_in = B_FALSE -#else -#define IPOINTER_SET -#define IPOINTER_CLR -#endif - - -/* Insert a marker record into the inode list after inode ip. The list - * must be locked when this is called. After the call the list will no - * longer be locked. - */ -#define IPOINTER_INSERT(ip, mp) { \ - ASSERT(ipointer_in == B_FALSE); \ - ipointer->ip_mnext = ip->i_mnext; \ - ipointer->ip_mprev = ip; \ - ip->i_mnext = (xfs_inode_t *)ipointer; \ - ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \ - preempt = 0; \ - XFS_MOUNT_IUNLOCK(mp); \ - mount_locked = B_FALSE; \ - IPOINTER_SET; \ - } - -/* Remove the marker from the inode list. If the marker was the only item - * in the list then there are no remaining inodes and we should zero out - * the whole list. If we are the current head of the list then move the head - * past us. - */ -#define IPOINTER_REMOVE(ip, mp) { \ - ASSERT(ipointer_in == B_TRUE); \ - if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \ - ip = ipointer->ip_mnext; \ - ip->i_mprev = ipointer->ip_mprev; \ - ipointer->ip_mprev->i_mnext = ip; \ - if (mp->m_inodes == (xfs_inode_t *)ipointer) { \ - mp->m_inodes = ip; \ - } \ - } else { \ - ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \ - mp->m_inodes = NULL; \ - ip = NULL; \ - } \ - IPOINTER_CLR; \ - } - -#define XFS_PREEMPT_MASK 0x7f - - ASSERT(!(flags & SYNC_BDFLUSH)); - - if (bypassed) - *bypassed = 0; - if (mp->m_flags & XFS_MOUNT_RDONLY) - return 0; - error = 0; - last_error = 0; - preempt = 0; - - /* Allocate a reference marker */ - ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP); - - fflag = XFS_B_ASYNC; /* default is don't wait */ - if (flags & SYNC_DELWRI) - fflag = XFS_B_DELWRI; - if (flags & SYNC_WAIT) - fflag = 0; /* synchronous overrides all */ - - base_lock_flags = XFS_ILOCK_SHARED; - if (flags & (SYNC_DELWRI | SYNC_CLOSE)) { - /* - * We need the I/O lock if we're going to call any of - * the flush/inval routines. - */ - base_lock_flags |= XFS_IOLOCK_SHARED; - } - - XFS_MOUNT_ILOCK(mp); - - ip = mp->m_inodes; - - mount_locked = B_TRUE; - vnode_refed = B_FALSE; - - IPOINTER_CLR; - - do { - ASSERT(ipointer_in == B_FALSE); - ASSERT(vnode_refed == B_FALSE); - - lock_flags = base_lock_flags; - - /* - * There were no inodes in the list, just break out - * of the loop. - */ - if (ip == NULL) { - break; - } - - /* - * We found another sync thread marker - skip it - */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - - vp = VFS_I(ip); - - /* - * If the vnode is gone then this is being torn down, - * call reclaim if it is flushed, else let regular flush - * code deal with it later in the loop. - */ - - if (vp == NULL) { - /* Skip ones already in reclaim */ - if (ip->i_flags & XFS_IRECLAIM) { - ip = ip->i_mnext; - continue; - } - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { - ip = ip->i_mnext; - } else if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - IPOINTER_INSERT(ip, mp); - - xfs_finish_reclaim(ip, 1, - XFS_IFLUSH_DELWRI_ELSE_ASYNC); - - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - IPOINTER_REMOVE(ip, mp); - } else { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - ip = ip->i_mnext; - } - continue; - } - - if (VN_BAD(vp)) { - ip = ip->i_mnext; - continue; - } - - if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { - XFS_MOUNT_IUNLOCK(mp); - kmem_free(ipointer); - return 0; - } - - /* - * Try to lock without sleeping. We're out of order with - * the inode list lock here, so if we fail we need to drop - * the mount lock and try again. If we're called from - * bdflush() here, then don't bother. - * - * The inode lock here actually coordinates with the - * almost spurious inode lock in xfs_ireclaim() to prevent - * the vnode we handle here without a reference from - * being freed while we reference it. If we lock the inode - * while it's on the mount list here, then the spurious inode - * lock in xfs_ireclaim() after the inode is pulled from - * the mount list will sleep until we release it here. - * This keeps the vnode from being freed while we reference - * it. - */ - if (xfs_ilock_nowait(ip, lock_flags) == 0) { - if (vp == NULL) { - ip = ip->i_mnext; - continue; - } - - vp = vn_grab(vp); - if (vp == NULL) { - ip = ip->i_mnext; - continue; - } - - IPOINTER_INSERT(ip, mp); - xfs_ilock(ip, lock_flags); - - ASSERT(vp == VFS_I(ip)); - ASSERT(ip->i_mount == mp); - - vnode_refed = B_TRUE; - } - - /* From here on in the loop we may have a marker record - * in the inode list. - */ - - /* - * If we have to flush data or wait for I/O completion - * we need to drop the ilock that we currently hold. - * If we need to drop the lock, insert a marker if we - * have not already done so. - */ - if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) || - ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) { - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (flags & SYNC_CLOSE) { - /* Shutdown case. Flush and invalidate. */ - if (XFS_FORCED_SHUTDOWN(mp)) - xfs_tosspages(ip, 0, -1, - FI_REMAPF); - else - error = xfs_flushinval_pages(ip, - 0, -1, FI_REMAPF); - } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) { - error = xfs_flush_pages(ip, 0, - -1, fflag, FI_NONE); - } - - /* - * When freezing, we need to wait ensure all I/O (including direct - * I/O) is complete to ensure no further data modification can take - * place after this point - */ - if (flags & SYNC_IOWAIT) - vn_iowait(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - - if ((flags & SYNC_ATTR) && - (ip->i_update_core || - (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) { - if (mount_locked) - IPOINTER_INSERT(ip, mp); - - if (flags & SYNC_WAIT) { - xfs_iflock(ip); - error = xfs_iflush(ip, XFS_IFLUSH_SYNC); - - /* - * If we can't acquire the flush lock, then the inode - * is already being flushed so don't bother waiting. - * - * If we can lock it then do a delwri flush so we can - * combine multiple inode flushes in each disk write. - */ - } else if (xfs_iflock_nowait(ip)) { - error = xfs_iflush(ip, XFS_IFLUSH_DELWRI); - } else if (bypassed) { - (*bypassed)++; - } - } - - if (lock_flags != 0) { - xfs_iunlock(ip, lock_flags); - } - - if (vnode_refed) { - /* - * If we had to take a reference on the vnode - * above, then wait until after we've unlocked - * the inode to release the reference. This is - * because we can be already holding the inode - * lock when IRELE() calls xfs_inactive(). - * - * Make sure to drop the mount lock before calling - * IRELE() so that we don't trip over ourselves if - * we have to go for the mount lock again in the - * inactive code. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - IRELE(ip); - - vnode_refed = B_FALSE; - } - - if (error) { - last_error = error; - } - - /* - * bail out if the filesystem is corrupted. - */ - if (error == EFSCORRUPTED) { - if (!mount_locked) { - XFS_MOUNT_ILOCK(mp); - IPOINTER_REMOVE(ip, mp); - } - XFS_MOUNT_IUNLOCK(mp); - ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer); - return XFS_ERROR(error); - } - - /* Let other threads have a chance at the mount lock - * if we have looped many times without dropping the - * lock. - */ - if ((++preempt & XFS_PREEMPT_MASK) == 0) { - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - } - - if (mount_locked == B_FALSE) { - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - IPOINTER_REMOVE(ip, mp); - continue; - } - - ASSERT(ipointer_in == B_FALSE); - ip = ip->i_mnext; - - } while (ip != mp->m_inodes); - - XFS_MOUNT_IUNLOCK(mp); - - ASSERT(ipointer_in == B_FALSE); - - kmem_free(ipointer); - return XFS_ERROR(last_error); -} - -/* - * xfs sync routine for internal use - * - * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. - * - */ -int -xfs_syncsub( - xfs_mount_t *mp, - int flags, - int *bypassed) -{ - int error = 0; - int last_error = 0; - uint log_flags = XFS_LOG_FORCE; - xfs_buf_t *bp; - xfs_buf_log_item_t *bip; - - /* - * Sync out the log. This ensures that the log is periodically - * flushed even if there is not enough activity to fill it up. - */ - if (flags & SYNC_WAIT) - log_flags |= XFS_LOG_SYNC; - - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - - if (flags & (SYNC_ATTR|SYNC_DELWRI)) { - if (flags & SYNC_BDFLUSH) - xfs_finish_reclaim_all(mp, 1); - else - error = xfs_sync_inodes(mp, flags, bypassed); - } - - /* - * Flushing out dirty data above probably generated more - * log activity, so if this isn't vfs_sync() then flush - * the log again. - */ - if (flags & SYNC_DELWRI) { - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - } - - if (flags & SYNC_FSDATA) { - /* - * If this is vfs_sync() then only sync the superblock - * if we can lock it without sleeping and it is not pinned. - */ - if (flags & SYNC_BDFLUSH) { - bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); - if (bp != NULL) { - bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); - if ((bip != NULL) && - xfs_buf_item_dirty(bip)) { - if (!(XFS_BUF_ISPINNED(bp))) { - XFS_BUF_ASYNC(bp); - error = xfs_bwrite(mp, bp); - } else { - xfs_buf_relse(bp); - } - } else { - xfs_buf_relse(bp); - } - } - } else { - bp = xfs_getsb(mp, 0); - /* - * If the buffer is pinned then push on the log so - * we won't get stuck waiting in the write for - * someone, maybe ourselves, to flush the log. - * Even though we just pushed the log above, we - * did not have the superblock buffer locked at - * that point so it can become pinned in between - * there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - error = xfs_bwrite(mp, bp); - } - if (error) { - last_error = error; - } - } - - /* - * Now check to see if the log needs a "dummy" transaction. - */ - if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { - xfs_trans_t *tp; - xfs_inode_t *ip; - - /* - * Put a dummy transaction in the log to tell - * recovery that all others are OK. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if ((error = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - } - - /* - * When shutting down, we need to insure that the AIL is pushed - * to disk or the filesystem can appear corrupt from the PROM. - */ - if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) { - XFS_bflush(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - XFS_bflush(mp->m_rtdev_targp); - } - } - - return XFS_ERROR(last_error); -} diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h deleted file mode 100644 index a74b05087da4..000000000000 --- a/fs/xfs/xfs_vfsops.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _XFS_VFSOPS_H -#define _XFS_VFSOPS_H 1 - -struct cred; -struct xfs_fid; -struct inode; -struct kstatfs; -struct xfs_mount; -struct xfs_mount_args; - -int xfs_sync(struct xfs_mount *mp, int flags); -void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, - int lnnum); -void xfs_attr_quiesce(struct xfs_mount *mp); - -#endif /* _XFS_VFSOPS_H */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 8b6812f66a15..f07bf8768c3a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -54,33 +54,10 @@ #include "xfs_vnodeops.h" int -xfs_open( - xfs_inode_t *ip) -{ - int mode; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * If it's a directory with any blocks, read-ahead block 0 - * as we're almost certain to have the next operation be a read there. - */ - if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) { - mode = xfs_ilock_map_shared(ip); - if (ip->i_d.di_nextents > 0) - (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); - xfs_iunlock(ip, mode); - } - return 0; -} - -int xfs_setattr( struct xfs_inode *ip, struct iattr *iattr, - int flags, - cred_t *credp) + int flags) { xfs_mount_t *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -93,7 +70,6 @@ xfs_setattr( gid_t gid=0, igid=0; int timeflags = 0; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; - int file_owner; int need_iolock = 1; xfs_itrace_entry(ip); @@ -104,6 +80,10 @@ xfs_setattr( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + code = -inode_change_ok(inode, iattr); + if (code) + return code; + olddquot1 = olddquot2 = NULL; udqp = gdqp = NULL; @@ -181,62 +161,8 @@ xfs_setattr( xfs_ilock(ip, lock_flags); - /* boolean: are we the file owner? */ - file_owner = (current_fsuid() == ip->i_d.di_uid); - - /* - * Change various properties of a file. - * Only the owner or users with CAP_FOWNER - * capability may do these things. - */ - if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) { - /* - * CAP_FOWNER overrides the following restrictions: - * - * The user ID of the calling process must be equal - * to the file owner ID, except in cases where the - * CAP_FSETID capability is applicable. - */ - if (!file_owner && !capable(CAP_FOWNER)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * CAP_FSETID overrides the following restrictions: - * - * The effective user ID of the calling process shall match - * the file owner when setting the set-user-ID and - * set-group-ID bits on that file. - * - * The effective group ID or one of the supplementary group - * IDs of the calling process shall match the group owner of - * the file when setting the set-group-ID bit on that file - */ - if (mask & ATTR_MODE) { - mode_t m = 0; - - if ((iattr->ia_mode & S_ISUID) && !file_owner) - m |= S_ISUID; - if ((iattr->ia_mode & S_ISGID) && - !in_group_p((gid_t)ip->i_d.di_gid)) - m |= S_ISGID; -#if 0 - /* Linux allows this, Irix doesn't. */ - if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) - m |= S_ISVTX; -#endif - if (m && !capable(CAP_FSETID)) - iattr->ia_mode &= ~m; - } - } - /* * Change file ownership. Must be the owner or privileged. - * If the system was configured with the "restricted_chown" - * option, the owner is not permitted to give away the file, - * and can change the group id only to a group of which he - * or she is a member. */ if (mask & (ATTR_UID|ATTR_GID)) { /* @@ -251,23 +177,6 @@ xfs_setattr( uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; /* - * CAP_CHOWN overrides the following restrictions: - * - * If _POSIX_CHOWN_RESTRICTED is defined, this capability - * shall override the restriction that a process cannot - * change the user ID of a file it owns and the restriction - * that the group ID supplied to the chown() function - * shall be equal to either the group ID or one of the - * supplementary group IDs of the calling process. - */ - if (restricted_chown && - (iuid != uid || (igid != gid && - !in_group_p((gid_t)gid))) && - !capable(CAP_CHOWN)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - /* * Do a quota reservation only if uid/gid is actually * going to change. */ @@ -304,36 +213,22 @@ xfs_setattr( code = XFS_ERROR(EINVAL); goto error_return; } + /* * Make sure that the dquots are attached to the inode. */ - if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) + code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); + if (code) goto error_return; - } - - /* - * Change file access or modified times. - */ - if (mask & (ATTR_ATIME|ATTR_MTIME)) { - if (!file_owner) { - if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) && - !capable(CAP_FOWNER)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - } - } - /* - * Now we can make the changes. Before we join the inode - * to the transaction, if ATTR_SIZE is set then take care of - * the part of the truncation that must be done without the - * inode lock. This needs to be done before joining the inode - * to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. - */ - if (mask & ATTR_SIZE) { - code = 0; + /* + * Now we can make the changes. Before we join the inode + * to the transaction, if ATTR_SIZE is set then take care of + * the part of the truncation that must be done without the + * inode lock. This needs to be done before joining the inode + * to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. + */ if (iattr->ia_size > ip->i_size) { /* * Do the first part of growing a file: zero any data @@ -366,7 +261,7 @@ xfs_setattr( } /* wait for all I/O to complete */ - vn_iowait(ip); + xfs_ioend_wait(ip); if (!code) code = xfs_itruncate_data(ip, iattr->ia_size); @@ -388,17 +283,10 @@ xfs_setattr( } commit_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); - } - if (tp) { xfs_trans_ijoin(tp, ip, lock_flags); xfs_trans_ihold(tp, ip); - } - /* - * Truncate file. Must have write permission and not be a directory. - */ - if (mask & ATTR_SIZE) { /* * Only change the c/mtime if we are changing the size * or we are explicitly asked to change it. This handles @@ -438,28 +326,13 @@ xfs_setattr( */ xfs_iflags_set(ip, XFS_ITRUNCATED); } - } - - /* - * Change file access modes. - */ - if (mask & ATTR_MODE) { - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= iattr->ia_mode & ~S_IFMT; - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; + } else if (tp) { + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ihold(tp, ip); } /* * Change file ownership. Must be the owner or privileged. - * If the system was configured with the "restricted_chown" - * option, the owner is not permitted to give away the file, - * and can change the group id only to a group of which he - * or she is a member. */ if (mask & (ATTR_UID|ATTR_GID)) { /* @@ -503,6 +376,24 @@ xfs_setattr( timeflags |= XFS_ICHGTIME_CHG; } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + timeflags |= XFS_ICHGTIME_CHG; + } /* * Change file access or modified times. @@ -713,7 +604,7 @@ xfs_fsync( return XFS_ERROR(EIO); /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(VFS_I(ip)->i_mapping); + error = xfs_wait_on_pages(ip, 0, -1); if (error) return XFS_ERROR(error); @@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt( goto error0; } /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + + /* * Remove the memory for extent descriptions (just bookkeeping). */ if (ip->i_df.if_bytes) @@ -1625,8 +1522,6 @@ xfs_create( xfs_trans_set_sync(tp); } - dp->i_gen++; - /* * Attach the dquot(s) to the inodes and modify them incore. * These ids of the inode couldn't have changed since the new @@ -1993,13 +1888,6 @@ xfs_remove( } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - /* - * Bump the in memory generation count on the parent - * directory so that other can know that it has changed. - */ - dp->i_gen++; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - if (is_dir) { /* * Drop the link from ip's "..". @@ -2009,7 +1897,7 @@ xfs_remove( goto out_bmap_cancel; /* - * Drop the link from dp to ip. + * Drop the "." link from ip to self. */ error = xfs_droplink(tp, ip); if (error) @@ -2017,14 +1905,14 @@ xfs_remove( } else { /* * When removing a non-directory we need to log the parent - * inode here for the i_gen update. For a directory this is - * done implicitly by the xfs_droplink call for the ".." entry. + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. */ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } /* - * Drop the "." link from ip to self. + * Drop the link from dp to ip. */ error = xfs_droplink(tp, ip); if (error) @@ -2178,7 +2066,6 @@ xfs_link( if (error) goto abort_return; xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - tdp->i_gen++; xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); @@ -2355,18 +2242,10 @@ xfs_mkdir( } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - /* - * Bump the in memory version number of the parent directory - * so that other processes accessing it will recognize that - * the directory has changed. - */ - dp->i_gen++; - error = xfs_dir_init(tp, cdp, dp); if (error) goto error2; - cdp->i_gen = 1; error = xfs_bumplink(tp, dp); if (error) goto error2; @@ -2653,13 +2532,6 @@ xfs_symlink( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); /* - * Bump the in memory version number of the parent directory - * so that other processes accessing it will recognize that - * the directory has changed. - */ - dp->i_gen++; - - /* * If this is a synchronous mount, make sure that the * symlink transaction goes to disk before returning to * the user. @@ -2809,7 +2681,7 @@ xfs_reclaim( return 0; } - vn_iowait(ip); + xfs_ioend_wait(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -2833,122 +2705,10 @@ xfs_reclaim( if (!ip->i_update_core && (ip->i_itemp == NULL)) { xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_iflock(ip); - return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); - } else { - xfs_mount_t *mp = ip->i_mount; - - /* Protect sync and unpin from us */ - XFS_MOUNT_ILOCK(mp); - spin_lock(&ip->i_flags_lock); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - VFS_I(ip)->i_private = NULL; - ip->i_vnode = NULL; - spin_unlock(&ip->i_flags_lock); - list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); - XFS_MOUNT_IUNLOCK(mp); - } - return 0; -} - -int -xfs_finish_reclaim( - xfs_inode_t *ip, - int locked, - int sync_mode) -{ - xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - struct inode *vp = VFS_I(ip); - - if (vp && VN_BAD(vp)) - goto reclaim; - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - if (__xfs_iflags_test(ip, XFS_IRECLAIM) || - (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) { - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - if (locked) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - return 1; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_put_perag(ip->i_mount, pag); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); + xfs_iflags_set(ip, XFS_IRECLAIMABLE); + return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); } - - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - reclaim: - xfs_ireclaim(ip); - return 0; -} - -int -xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) -{ - int purged; - xfs_inode_t *ip, *n; - int done = 0; - - while (!done) { - purged = 0; - XFS_MOUNT_ILOCK(mp); - list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) { - if (noblock) { - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) - continue; - if (xfs_ipincount(ip) || - !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - } - XFS_MOUNT_IUNLOCK(mp); - if (xfs_finish_reclaim(ip, noblock, - XFS_IFLUSH_DELWRI_ELSE_ASYNC)) - delay(1); - purged = 1; - break; - } - - done = !purged; - } - - XFS_MOUNT_IUNLOCK(mp); + xfs_inode_set_reclaim_tag(ip); return 0; } @@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes( bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); + if (!bp) + return XFS_ERROR(ENOMEM); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -3312,7 +3074,8 @@ xfs_free_file_space( need_iolock = 0; if (need_iolock) { xfs_ilock(ip, XFS_IOLOCK_EXCL); - vn_iowait(ip); /* wait for the completion of any pending DIOs */ + /* wait for the completion of any pending DIOs */ + xfs_ioend_wait(ip); } rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); @@ -3474,7 +3237,6 @@ xfs_change_file_space( int cmd, xfs_flock64_t *bf, xfs_off_t offset, - cred_t *credp, int attr_flags) { xfs_mount_t *mp = ip->i_mount; @@ -3562,7 +3324,7 @@ xfs_change_file_space( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = startoffset; - error = xfs_setattr(ip, &iattr, attr_flags, credp); + error = xfs_setattr(ip, &iattr, attr_flags); if (error) return error; diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 7b0c2ab88333..76df328c61b4 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -14,9 +14,7 @@ struct xfs_inode; struct xfs_iomap; -int xfs_open(struct xfs_inode *ip); -int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, - cred_t *credp); +int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); #define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ @@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, - xfs_flock64_t *bf, xfs_off_t offset, - cred_t *credp, int attr_flags); + xfs_flock64_t *bf, xfs_off_t offset, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip); @@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -int xfs_ioctl(struct xfs_inode *ip, struct file *filp, - int ioflags, unsigned int cmd, void __user *arg); ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, int ioflags); @@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, int fiopt); int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); +int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); #endif /* _XFS_VNODEOPS_H */ |