diff options
Diffstat (limited to 'fs')
96 files changed, 1990 insertions, 848 deletions
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 67cf810e0fd6..654d8fdbf01f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -100,6 +100,7 @@ int afs_open_socket(void) ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret < 0) { sock_release(socket); + destroy_workqueue(afs_async_calls); _leave(" = %d [bind]", ret); return ret; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 9a0520b50663..11b1ea786d00 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/param.h> #include <linux/time.h> +#include <linux/compat.h> #include <linux/smp_lock.h> #include "autofs_i.h" @@ -25,13 +26,17 @@ static int autofs_root_symlink(struct inode *,struct dentry *,const char *); static int autofs_root_unlink(struct inode *,struct dentry *); static int autofs_root_rmdir(struct inode *,struct dentry *); static int autofs_root_mkdir(struct inode *,struct dentry *,int); -static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); +static long autofs_root_ioctl(struct file *,unsigned int,unsigned long); +static long autofs_root_compat_ioctl(struct file *,unsigned int,unsigned long); const struct file_operations autofs_root_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = autofs_root_readdir, - .ioctl = autofs_root_ioctl, + .unlocked_ioctl = autofs_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs_root_compat_ioctl, +#endif }; const struct inode_operations autofs_root_inode_operations = { @@ -492,6 +497,25 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode) } /* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, + unsigned int __user *p) +{ + unsigned long ntimeout; + + if (get_user(ntimeout, p) || + put_user(sbi->exp_timeout / HZ, p)) + return -EFAULT; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + 
sbi->exp_timeout = ntimeout * HZ; + + return 0; +} +#endif + static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { @@ -546,7 +570,7 @@ static inline int autofs_expire_run(struct super_block *sb, * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs_root_ioctl(struct inode *inode, struct file *filp, +static int autofs_do_root_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); @@ -571,6 +595,10 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ return autofs_get_protover(argp); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs_compat_get_set_timeout(sbi, argp); +#endif case AUTOFS_IOC_SETTIMEOUT: return autofs_get_set_timeout(sbi, argp); case AUTOFS_IOC_EXPIRE: @@ -579,4 +607,37 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, default: return -ENOSYS; } + +} + +static long autofs_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int ret; + + lock_kernel(); + ret = autofs_do_root_ioctl(filp->f_path.dentry->d_inode, + filp, cmd, arg); + unlock_kernel(); + + return ret; +} + +#ifdef CONFIG_COMPAT +static long autofs_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret; + + lock_kernel(); + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs_do_root_ioctl(inode, filp, cmd, arg); + else + ret = autofs_do_root_ioctl(inode, filp, cmd, + (unsigned long)compat_ptr(arg)); + unlock_kernel(); + + return ret; } +#endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index db4117ed7803..cb1bd38dc08c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -18,7 +18,9 @@ #include <linux/slab.h> #include <linux/param.h> #include 
<linux/time.h> +#include <linux/compat.h> #include <linux/smp_lock.h> + #include "autofs_i.h" static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); @@ -26,6 +28,7 @@ static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long); +static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); @@ -40,6 +43,9 @@ const struct file_operations autofs4_root_operations = { .readdir = dcache_readdir, .llseek = dcache_dir_lseek, .unlocked_ioctl = autofs4_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs4_root_compat_ioctl, +#endif }; const struct file_operations autofs4_dir_operations = { @@ -198,8 +204,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) } /* Initialize expiry counter after successful mount */ - if (ino) - ino->last_used = jiffies; + ino->last_used = jiffies; spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; @@ -840,6 +845,26 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) } /* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, + compat_ulong_t __user *p) +{ + int rv; + unsigned long ntimeout; + + if ((rv = get_user(ntimeout, p)) || + (rv = put_user(sbi->exp_timeout/HZ, p))) + return rv; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +} +#endif + static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { @@ -933,6 +958,10 @@ static int 
autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, return autofs4_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: return autofs4_get_set_timeout(sbi, p); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs4_compat_get_set_timeout(sbi, p); +#endif case AUTOFS_IOC_ASKUMOUNT: return autofs4_ask_umount(filp->f_path.mnt, p); @@ -961,3 +990,22 @@ static long autofs4_root_ioctl(struct file *filp, return ret; } + +#ifdef CONFIG_COMPAT +static long autofs4_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + int ret; + + lock_kernel(); + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + else + ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + (unsigned long)compat_ptr(arg)); + unlock_kernel(); + + return ret; +} +#endif @@ -843,7 +843,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (!bio) goto out_bmd; - bio->bi_rw |= (!write_to_vm << BIO_RW); + if (!write_to_vm) + bio->bi_rw |= REQ_WRITE; ret = 0; @@ -1024,7 +1025,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * set data direction, and check if mapped pages need bouncing */ if (!write_to_vm) - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; bio->bi_bdev = bdev; bio->bi_flags |= (1 << BIO_USER_MAPPED); diff --git a/fs/block_dev.c b/fs/block_dev.c index 451afbd543b5..50e8c8582faa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1340,19 +1340,20 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) /* * hooks: /n/, see "layering violations". 
*/ - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); - return ret; + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } } - lock_kernel(); restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) - goto out_unlock_kernel; + goto out; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { @@ -1432,7 +1433,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); - unlock_kernel(); return 0; out_clear: @@ -1445,9 +1445,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_contains = NULL; out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); - out_unlock_kernel: - unlock_kernel(); - + out: if (disk) module_put(disk->fops->owner); put_disk(disk); @@ -1516,7 +1514,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct block_device *victim = NULL; mutex_lock_nested(&bdev->bd_mutex, for_part); - lock_kernel(); if (for_part) bdev->bd_part_count--; @@ -1541,7 +1538,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; } - unlock_kernel(); mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34f7c375567e..64f10082f048 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -480,7 +480,7 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.func = end_workqueue_fn; end_io_wq->work.flags = 0; - if (bio->bi_rw & (1 << BIO_RW)) { + if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); @@ -604,7 +604,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw 
& (1 << BIO_RW_SYNCIO)) + if (rw & REQ_SYNC) btrfs_set_work_high_prio(&async->work); btrfs_queue_worker(&fs_info->workers, &async->work); @@ -668,7 +668,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, 1); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -1427,7 +1427,7 @@ static void end_workqueue_fn(struct btrfs_work *work) * ram and up to date before trying to verify things. For * blocksize <= pagesize, it is basically a noop */ - if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata && !bio_ready_for_csum(bio)) { btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8976c3343a96..c03864406af3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1429,7 +1429,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1841,7 +1841,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, bio->bi_size = 0; bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & (1 << BIO_RW)) + if (failed_bio->bi_rw & REQ_WRITE) rw = WRITE; else rw = READ; @@ -5647,7 +5647,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, struct bio_vec *bvec = bio->bi_io_vec; u64 start; int skip_sum; - int write = rw & (1 << BIO_RW); + int write = rw & REQ_WRITE; int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6e3af8be95b..dd318ff280b2 100644 --- a/fs/btrfs/volumes.c +++ 
b/fs/btrfs/volumes.c @@ -258,7 +258,7 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); - if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) + if (cur->bi_rw & REQ_SYNC) num_sync_run++; submit_bio(cur->bi_rw, cur); @@ -2651,7 +2651,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int max_errors = 0; struct btrfs_multi_bio *multi = NULL; - if (multi_ret && !(rw & (1 << BIO_RW))) + if (multi_ret && !(rw & REQ_WRITE)) stripes_allocated = 1; again: if (multi_ret) { @@ -2687,7 +2687,7 @@ again: mirror_num = 0; /* if our multi bio struct is too small, back off and try again */ - if (rw & (1 << BIO_RW)) { + if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { stripes_required = map->num_stripes; @@ -2697,7 +2697,7 @@ again: max_errors = 1; } } - if (multi_ret && (rw & (1 << BIO_RW)) && + if (multi_ret && (rw & REQ_WRITE) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2733,7 +2733,7 @@ again: num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2744,7 +2744,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (1 << BIO_RW)) + if (rw & REQ_WRITE) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2755,7 +2755,7 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (unplug_page || (rw & (1 << BIO_RW))) + if (unplug_page || (rw & REQ_WRITE)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -2945,7 +2945,7 @@ static noinline int schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ - if (!(rw & (1 << BIO_RW))) { + if (!(rw & REQ_WRITE)) { 
bio_get(bio); submit_bio(rw, bio); bio_put(bio); @@ -2964,7 +2964,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) + if (bio->bi_rw & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 6a660e610be8..278e1172600d 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),) obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ +ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ messenger.o msgpool.o buffer.o pagelist.o \ mds_client.o mdsmap.o \ diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d9c60b84949a..5598a0d02295 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -309,7 +309,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -552,7 +553,7 @@ static void writepages_finish(struct ceph_osd_request *req, * page truncation thread, possibly losing some data that * raced its way in */ - if ((issued & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) generic_error_remove_page(inode->i_mapping, page); unlock_page(page); @@ -797,9 +798,12 @@ get_more_pages: dout("%p will write page %p idx %lu\n", inode, page, page->index); - writeback_stat = atomic_long_inc_return(&client->writeback_count); - if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + writeback_stat = + atomic_long_inc_return(&client->writeback_count); + 
if (writeback_stat > CONGESTION_ON_THRESH( + client->mount_args->congestion_kb)) { + set_bdi_congested(&client->backing_dev_info, + BLK_RW_ASYNC); } set_page_writeback(page); @@ -1036,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); + inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); } while (r == -EAGAIN); diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c index 67b2c030924b..eb2a666b0be7 100644 --- a/fs/ceph/armor.c +++ b/fs/ceph/armor.c @@ -1,11 +1,15 @@ #include <linux/errno.h> +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end); + /* * base64 encode/decode. */ -const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char *pem_key = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; static int encode_bits(int c) { diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 89490beaf537..6d2e30600627 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -20,7 +20,7 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) { switch (protocol) { case CEPH_AUTH_NONE: @@ -133,8 +133,8 @@ bad: return -ERANGE; } -int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) { struct ceph_mon_request_header *monhdr = msg_buf; void *p = monhdr + 1; diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 6d44053ecff1..582e0b2caf8a 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -87,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret, /* * get existing (or insert new) 
ticket handler */ -struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, - int service) +static struct ceph_x_ticket_handler * +get_ticket_handler(struct ceph_auth_client *ac, int service) { struct ceph_x_ticket_handler *th; struct ceph_x_info *xi = ac->private; @@ -429,7 +429,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, auth->struct_v = 1; auth->key = 0; for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) - auth->key ^= *u; + auth->key ^= *(__le64 *)u; dout(" server_challenge %llx client_challenge %llx key %llx\n", xi->server_challenge, le64_to_cpu(auth->client_challenge), le64_to_cpu(auth->key)); diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c index c67535d70aa6..cd39f17021de 100644 --- a/fs/ceph/buffer.c +++ b/fs/ceph/buffer.c @@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref) kfree(b); } -int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp) -{ - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); - b->is_vmalloc = true; - } - if (!b->vec.iov_base) - return -ENOMEM; - b->alloc_len = len; - b->vec.iov_len = len; - return 0; -} - int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) { size_t len; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b81be9a56487..7bf182b03973 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps) return cap_str[i]; } -/* - * Cap reservations - * - * Maintain a global pool of preallocated struct ceph_caps, referenced - * by struct ceph_caps_reservations. This ensures that we preallocate - * memory needed to successfully process an MDS response. (If an MDS - * sends us cap information and we fail to process it, we will have - * problems due to the client and MDS being out of sync.) - * - * Reservations are 'owned' by a ceph_cap_reservation context. 
- */ -static spinlock_t caps_list_lock; -static struct list_head caps_list; /* unused (reserved or unreserved) */ -static int caps_total_count; /* total caps allocated */ -static int caps_use_count; /* in use */ -static int caps_reserve_count; /* unused, reserved */ -static int caps_avail_count; /* unused, unreserved */ -static int caps_min_count; /* keep at least this many (unreserved) */ - -void __init ceph_caps_init(void) +void ceph_caps_init(struct ceph_mds_client *mdsc) { - INIT_LIST_HEAD(&caps_list); - spin_lock_init(&caps_list_lock); + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); } -void ceph_caps_finalize(void) +void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; - spin_lock(&caps_list_lock); - while (!list_empty(&caps_list)) { - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } - caps_total_count = 0; - caps_avail_count = 0; - caps_use_count = 0; - caps_reserve_count = 0; - caps_min_count = 0; - spin_unlock(&caps_list_lock); + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(int delta) +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) { - spin_lock(&caps_list_lock); - caps_min_count += delta; - BUG_ON(caps_min_count < 0); - spin_unlock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) +int ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) { int i; struct ceph_cap *cap; @@ 
-176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) dout("reserve caps ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ - spin_lock(&caps_list_lock); - if (caps_avail_count >= need) + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) have = need; else - have = caps_avail_count; - caps_avail_count -= have; - caps_reserve_count += have; - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); @@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) } BUG_ON(have + alloc != need); - spin_lock(&caps_list_lock); - caps_total_count += alloc; - caps_reserve_count += alloc; - list_splice(&newcaps, &caps_list); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); ctx->count = need; dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", - ctx, caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); return 0; out_alloc_count: @@ -220,26 +205,29 @@ out_alloc_count: return ret; } -int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) +int ceph_unreserve_caps(struct 
ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); if (ctx->count) { - spin_lock(&caps_list_lock); - BUG_ON(caps_reserve_count < ctx->count); - caps_reserve_count -= ctx->count; - caps_avail_count += ctx->count; + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; ctx->count = 0; dout("unreserve caps %d = %d used + %d resv + %d avail\n", - caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } return 0; } -static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) +static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -247,71 +235,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { - caps_use_count++; - caps_total_count++; + mdsc->caps_use_count++; + mdsc->caps_total_count++; } return cap; } - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", - ctx, ctx->count, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); - BUG_ON(ctx->count > caps_reserve_count); - BUG_ON(list_empty(&caps_list)); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); 
ctx->count--; - caps_reserve_count--; - caps_use_count++; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); return cap; } -void ceph_put_cap(struct ceph_cap *cap) +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("put_cap %p %d = %d used + %d resv + %d avail\n", - cap, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); - caps_use_count--; + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. 
*/ - if (caps_avail_count >= caps_reserve_count + caps_min_count) { - caps_total_count--; + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { - caps_avail_count++; - list_add(&cap->caps_item, &caps_list); + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); } - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_client *client, int *total, int *avail, int *used, int *reserved, int *min) { + struct ceph_mds_client *mdsc = &client->mdsc; + if (total) - *total = caps_total_count; + *total = mdsc->caps_total_count; if (avail) - *avail = caps_avail_count; + *avail = mdsc->caps_avail_count; if (used) - *used = caps_use_count; + *used = mdsc->caps_use_count; if (reserved) - *reserved = caps_reserve_count; + *reserved = mdsc->caps_reserve_count; if (min) - *min = caps_min_count; + *min = mdsc->caps_min_count; } /* @@ -336,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) return NULL; } +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->vfs_inode.i_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->vfs_inode.i_lock); + return cap; +} + /* - * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else - * -1. + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. 
*/ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) { struct ceph_cap *cap; int mds = -1; struct rb_node *p; - /* prefer mds with WR|WRBUFFER|EXCL caps */ + /* prefer mds with WR|BUFFER|EXCL caps */ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); mds = cap->mds; - if (mseq) - *mseq = cap->mseq; if (cap->issued & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_EXCL)) @@ -364,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode) { int mds; spin_lock(&inode->i_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); + mds = __ceph_get_cap_mds(ceph_inode(inode)); spin_unlock(&inode->i_lock); return mds; } @@ -483,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & CEPH_CAP_FILE_CACHE) && - (had & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) ci->i_rdcache_gen++; /* @@ -543,7 +541,7 @@ retry: new_cap = NULL; } else { spin_unlock(&inode->i_lock); - new_cap = get_cap(caps_reservation); + new_cap = get_cap(mdsc, caps_reservation); if (new_cap == NULL) return -ENOMEM; goto retry; @@ -588,6 +586,7 @@ retry: } else { pr_err("ceph_add_cap: couldn't find snap realm %llx\n", realmino); + WARN_ON(!realm); } } @@ -831,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { int want = 0; int mode; - for (mode = 0; mode < 4; mode++) + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) if (ci->i_nr_by_mode[mode]) want |= ceph_caps_for_mode(mode); return want; @@ -901,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) ci->i_auth_cap = NULL; if (removed) - ceph_put_cap(cap); + ceph_put_cap(mdsc, cap); if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { struct ceph_snap_realm *realm = ci->i_snap_realm; 
@@ -1197,6 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) + __releases(ci->vfs_inode->i_lock) + __acquires(ci->vfs_inode->i_lock) { struct inode *inode = &ci->vfs_inode; int mds; @@ -1232,7 +1233,13 @@ retry: BUG_ON(capsnap->dirty == 0); /* pick mds, take s_mutex */ - mds = __ceph_get_cap_mds(ci, &mseq); + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -1251,8 +1258,8 @@ retry: } /* * if session == NULL, we raced against a cap - * deletion. retry, and we'll get a better - * @mds value next time. + * deletion or migration. retry, and we'll + * get a better @mds value next time. */ spin_lock(&inode->i_lock); goto retry; @@ -1290,6 +1297,7 @@ retry: list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); +out: if (psession) *psession = session; else if (session) { @@ -1435,7 +1443,6 @@ static int try_nonblocking_invalidate(struct inode *inode) */ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) - __releases(session->s_mutex) { struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = &client->mdsc; @@ -1510,11 +1517,13 @@ retry_locked: ci->i_wrbuffer_ref == 0 && /* no dirty pages... 
*/ ci->i_rdcache_gen && /* may have cached pages */ (file_wanted == 0 || /* no open files */ - (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { - if (revoking & CEPH_CAP_FILE_CACHE) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { dout("check_caps queuing invalidate\n"); queue_invalidate = 1; ci->i_rdcache_revoking = ci->i_rdcache_gen; @@ -2250,8 +2259,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) - __releases(session->s_mutex) + __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2278,6 +2286,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, * will invalidate _after_ writeback.) */ if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode) == 0) { revoked_rdcache = 1; @@ -2369,15 +2378,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, /* revocation, grant, or no-op? 
*/ if (cap->issued & ~newcaps) { - dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), - ceph_cap_string(newcaps)); - if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) - writeback = 1; /* will delay ack */ - else if (dirty & ~newcaps) - check_caps = 1; /* initiate writeback in check_caps */ - else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || - revoked_rdcache) - check_caps = 2; /* send revoke ack in check_caps */ + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & used & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { @@ -2568,7 +2584,8 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + int *open_target_sessions) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2600,6 +2617,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, ci->i_cap_exporting_mds = mds; ci->i_cap_exporting_mseq = mseq; ci->i_cap_exporting_issued = cap->issued; + + /* + * make sure we have open sessions with all possible + * export targets, so that we get the matching IMPORT + */ + *open_target_sessions = 1; } __ceph_remove_cap(cap); } @@ -2675,6 +2698,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 size, max_size; u64 tid; void *snaptrace; + size_t snaptrace_len; + void *flock; + u32 
flock_len; + int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); @@ -2683,7 +2710,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; - snaptrace = h + 1; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; @@ -2693,6 +2719,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, size = le64_to_cpu(h->size); max_size = le64_to_cpu(h->max_size); + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + + if (le16_to_cpu(msg->hdr.version) >= 2) { + void *p, *end; + + p = snaptrace + snaptrace_len; + end = msg->front.iov_base + msg->front.iov_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + flock = p; + } else { + flock = NULL; + flock_len = 0; + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2714,7 +2755,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, * along for the mds (who clearly thinks we still have this * cap). 
*/ - ceph_add_cap_releases(mdsc, session, -1); + ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); goto done; } @@ -2726,12 +2767,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session); + handle_cap_export(inode, h, session, &open_target_sessions); goto done; case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, - snaptrace, le32_to_cpu(h->snap_trace_len)); + snaptrace, snaptrace_len); ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, session); goto done_unlocked; @@ -2773,6 +2814,8 @@ done: done_unlocked: if (inode) iput(inode); + if (open_target_sessions) + ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h index 793f50cb7c22..5babb8e95352 100644 --- a/fs/ceph/ceph_frag.h +++ b/fs/ceph/ceph_frag.h @@ -1,5 +1,5 @@ -#ifndef _FS_CEPH_FRAG_H -#define _FS_CEPH_FRAG_H +#ifndef FS_CEPH_FRAG_H +#define FS_CEPH_FRAG_H /* * "Frags" are a way to describe a subset of a 32-bit number space, diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c index 79d76bc4303f..3ac6cc7c1156 100644 --- a/fs/ceph/ceph_fs.c +++ b/fs/ceph/ceph_fs.c @@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) int ceph_flags_to_mode(int flags) { + int mode; + #ifdef O_DIRECTORY /* fixme */ if ((flags & O_DIRECTORY) == O_DIRECTORY) return CEPH_FILE_MODE_PIN; #endif + if ((flags & O_APPEND) == O_APPEND) + flags |= O_WRONLY; + + if ((flags & O_ACCMODE) == O_RDWR) + mode = CEPH_FILE_MODE_RDWR; + else if ((flags & O_ACCMODE) == O_WRONLY) + mode = CEPH_FILE_MODE_WR; + else + mode = CEPH_FILE_MODE_RD; + #ifdef O_LAZY if (flags & O_LAZY) - return CEPH_FILE_MODE_LAZY; + mode |= CEPH_FILE_MODE_LAZY; #endif - if ((flags & O_APPEND) == O_APPEND) - flags |= O_WRONLY; - flags &= O_ACCMODE; - if ((flags & O_RDWR) == O_RDWR) - return CEPH_FILE_MODE_RDWR; - if ((flags & O_WRONLY) == O_WRONLY) 
- return CEPH_FILE_MODE_WR; - return CEPH_FILE_MODE_RD; + return mode; } int ceph_caps_for_mode(int mode) { - switch (mode) { - case CEPH_FILE_MODE_PIN: - return CEPH_CAP_PIN; - case CEPH_FILE_MODE_RD: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; - case CEPH_FILE_MODE_RDWR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | - CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | - CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | - CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - case CEPH_FILE_MODE_WR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - } - return 0; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; } diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 2fa992eaf7da..d5619ac86711 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -9,27 +9,13 @@ * LGPL2 */ -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H +#ifndef CEPH_FS_H +#define CEPH_FS_H #include "msgr.h" #include "rados.h" /* - * Ceph release version - */ -#define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 20 -#define CEPH_VERSION_PATCH 0 - -#define _CEPH_STRINGIFY(x) #x -#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x) -#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \ - "." CEPH_STRINGIFY(z) -#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \ - CEPH_VERSION_MINOR, CEPH_VERSION_PATCH) - -/* * subprotocol versions. when specific messages types or high-level * protocols change, bump the affected components. 
we keep rev * internal cluster protocols separately from the public, @@ -53,18 +39,10 @@ /* * feature bits */ -#define CEPH_FEATURE_UID 1 -#define CEPH_FEATURE_NOSRCADDR 2 -#define CEPH_FEATURE_FLOCK 4 - -#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_UID (1<<0) +#define CEPH_FEATURE_NOSRCADDR (1<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) +#define CEPH_FEATURE_FLOCK (1<<3) /* @@ -96,6 +74,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_CRYPTO_NONE 0x0 #define CEPH_CRYPTO_AES 0x1 +#define CEPH_AES_IV "cephsageyudagreg" + /* security/authentication protocols */ #define CEPH_AUTH_UNKNOWN 0x0 #define CEPH_AUTH_NONE 0x1 @@ -275,6 +255,7 @@ extern const char *ceph_mds_state_name(int s); #define CEPH_LOCK_IDFT 512 /* dir frag tree */ #define CEPH_LOCK_INEST 1024 /* mds internal */ #define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ /* client_session ops */ @@ -316,6 +297,8 @@ enum { CEPH_MDS_OP_RMXATTR = 0x01106, CEPH_MDS_OP_SETLAYOUT = 0x01107, CEPH_MDS_OP_SETATTR = 0x01108, + CEPH_MDS_OP_SETFILELOCK= 0x01109, + CEPH_MDS_OP_GETFILELOCK= 0x00110, CEPH_MDS_OP_MKNOD = 0x01201, CEPH_MDS_OP_LINK = 0x01202, @@ -386,6 +369,15 @@ union ceph_mds_request_args { struct { struct ceph_file_layout layout; } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 
pid; /* process id requesting the lock */ + __le64 pid_namespace; + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; } __attribute__ ((packed)); #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ @@ -480,6 +472,23 @@ struct ceph_mds_reply_dirfrag { __le32 dist[]; } __attribute__ ((packed)); +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 + +#define CEPH_LOCK_SHARED 1 +#define CEPH_LOCK_EXCL 2 +#define CEPH_LOCK_UNLOCK 4 + +struct ceph_filelock { + __le64 start;/* file offset to start lock at */ + __le64 length; /* num bytes to lock; 0 for all following start */ + __le64 client; /* which client holds the lock */ + __le64 pid; /* process id holding the lock on the client */ + __le64 pid_namespace; + __u8 type; /* shared lock, exclusive lock, or unlock */ +} __attribute__ ((packed)); + + /* file access modes */ #define CEPH_FILE_MODE_PIN 0 #define CEPH_FILE_MODE_RD 1 @@ -508,9 +517,10 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_SAUTH 2 #define CEPH_CAP_SLINK 4 #define CEPH_CAP_SXATTR 6 -#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ +#define CEPH_CAP_SFILE 8 +#define CEPH_CAP_SFLOCK 20 -#define CEPH_CAP_BITS 16 +#define CEPH_CAP_BITS 22 /* composed values */ #define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) @@ -528,6 +538,9 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) #define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) #define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) +#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK) +#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK) + /* cap masks (for getattr) */ #define CEPH_STAT_CAP_INODE CEPH_CAP_PIN @@ -563,7 +576,8 @@ int ceph_flags_to_mode(int flags); CEPH_CAP_FILE_EXCL) #define 
CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ - CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ + CEPH_CAP_PIN) #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ CEPH_LOCK_IXATTR) @@ -653,12 +667,21 @@ struct ceph_mds_cap_reconnect { __le64 cap_id; __le32 wanted; __le32 issued; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ + __le32 flock_len; /* size of flock state blob, if any */ +} __attribute__ ((packed)); +/* followed by flock blob */ + +struct ceph_mds_cap_reconnect_v1 { + __le64 cap_id; + __le32 wanted; + __le32 issued; __le64 size; struct ceph_timespec mtime, atime; __le64 snaprealm; __le64 pathbase; /* base ino for our path to this ino */ } __attribute__ ((packed)); -/* followed by encoded string */ struct ceph_mds_snaprealm_reconnect { __le64 ino; /* snap realm base */ diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h index 5ac470c433c9..d099c3f90236 100644 --- a/fs/ceph/ceph_hash.h +++ b/fs/ceph/ceph_hash.h @@ -1,5 +1,5 @@ -#ifndef _FS_CEPH_HASH_H -#define _FS_CEPH_HASH_H +#ifndef FS_CEPH_HASH_H +#define FS_CEPH_HASH_H #define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ #define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 7503aee828ce..c6179d3a26a2 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -28,6 +28,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_TRUNCATE: return "truncate"; case CEPH_OSD_OP_ZERO: return "zero"; case CEPH_OSD_OP_WRITEFULL: return "writefull"; + case CEPH_OSD_OP_ROLLBACK: return "rollback"; case CEPH_OSD_OP_APPEND: return "append"; case CEPH_OSD_OP_STARTSYNC: return "startsync"; @@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return 
"rmsnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } return "???"; } diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h index dcd7e7523700..97e435b191f4 100644 --- a/fs/ceph/crush/crush.h +++ b/fs/ceph/crush/crush.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H +#ifndef CEPH_CRUSH_CRUSH_H +#define CEPH_CRUSH_CRUSH_H #include <linux/types.h> diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h index ff48e110e4bb..91e884230d5d 100644 --- a/fs/ceph/crush/hash.h +++ b/fs/ceph/crush/hash.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_HASH_H -#define _CRUSH_HASH_H +#ifndef CEPH_CRUSH_HASH_H +#define CEPH_CRUSH_HASH_H #define CRUSH_HASH_RJENKINS1 0 diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h index 98e90046fd9f..c46b99c18bb0 100644 --- a/fs/ceph/crush/mapper.h +++ b/fs/ceph/crush/mapper.h @@ -1,5 +1,5 @@ -#ifndef _CRUSH_MAPPER_H -#define _CRUSH_MAPPER_H +#ifndef CEPH_CRUSH_MAPPER_H +#define CEPH_CRUSH_MAPPER_H /* * CRUSH functions for find rules and then mapping an input to an diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index f704b3b62424..a3e627f63293 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); } -const u8 *aes_iv = "cephsageyudagreg"; +static const u8 *aes_iv = (u8 *)CEPH_AES_IV; -int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) +static int ceph_aes_encrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) { struct scatterlist sg_in[2], sg_out[1]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, - const 
void *src1, size_t src1_len, - const void *src2, size_t src2_len) +static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, + size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) { struct scatterlist sg_in[3], sg_out[1]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) +static int ceph_aes_decrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) { struct scatterlist sg_in[1], sg_out[2]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); @@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, return 0; } -int ceph_aes_decrypt2(const void *key, int key_len, - void *dst1, size_t *dst1_len, - void *dst2, size_t *dst2_len, - const void *src, size_t src_len) +static int ceph_aes_decrypt2(const void *key, int key_len, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) { struct scatterlist sg_in[1], sg_out[3]; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h index 40b502e6bd89..bdf38607323c 100644 --- a/fs/ceph/crypto.h +++ b/fs/ceph/crypto.h @@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret, const void *src2, size_t src2_len); /* armor.c */ -extern int ceph_armor(char *dst, const void *src, const void *end); -extern int ceph_unarmor(void *dst, const char *src, const char *end); +extern int ceph_armor(char *dst, const char *src, const char *end); +extern int ceph_unarmor(char *dst, const char *src, const char *end); #endif diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2f5332ddbba..360c4f22718d 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ 
-291,7 +291,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ +#define DEFINE_SHOW_FUNC(name) \ static int name##_open(struct inode *inode, struct file *file) \ { \ struct seq_file *sf; \ @@ -361,8 +361,8 @@ int ceph_debugfs_client_init(struct ceph_client *client) int ret = 0; char name[80]; - snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", - PR_FSID(&client->fsid), client->monc.auth->global_id); + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) @@ -432,11 +432,12 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_caps) goto out; - client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", - 0600, - client->debugfs_dir, - client, - &congestion_kb_fops); + client->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + client->debugfs_dir, + client, + &congestion_kb_fops); if (!client->debugfs_congestion_kb) goto out; @@ -466,7 +467,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) debugfs_remove(client->debugfs_dir); } -#else // CONFIG_DEBUG_FS +#else /* CONFIG_DEBUG_FS */ int __init ceph_debugfs_init(void) { @@ -486,4 +487,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) { } -#endif // CONFIG_DEBUG_FS +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h index 65b3e022eaf5..3d25415afe63 100644 --- a/fs/ceph/decode.h +++ b/fs/ceph/decode.h @@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv, */ static inline void ceph_encode_addr(struct ceph_entity_addr *a) { - a->in_addr.ss_family = htons(a->in_addr.ss_family); + __be16 ss_family = htons(a->in_addr.ss_family); + a->in_addr.ss_family = *(__u16 *)&ss_family; } static inline void ceph_decode_addr(struct ceph_entity_addr *a) { - a->in_addr.ss_family = 
ntohs(a->in_addr.ss_family); + __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; + a->in_addr.ss_family = ntohs(ss_family); WARN_ON(a->in_addr.ss_family == 512); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f94ed3c7f6a5..67bbb41d5526 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -27,7 +27,7 @@ const struct inode_operations ceph_dir_iops; const struct file_operations ceph_dir_fops; -struct dentry_operations ceph_dentry_ops; +const struct dentry_operations ceph_dentry_ops; /* * Initialize ceph dentry state. @@ -94,6 +94,8 @@ static unsigned fpos_off(loff_t p) */ static int __dcache_readdir(struct file *filp, void *dirent, filldir_t filldir) + __releases(inode->i_lock) + __acquires(inode->i_lock) { struct inode *inode = filp->f_dentry->d_inode; struct ceph_file_info *fi = filp->private_data; @@ -1239,16 +1241,16 @@ const struct inode_operations ceph_dir_iops = { .create = ceph_create, }; -struct dentry_operations ceph_dentry_ops = { +const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_dentry_release, }; -struct dentry_operations ceph_snapdir_dentry_ops = { +const struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, .d_release = ceph_dentry_release, }; -struct dentry_operations ceph_snap_dentry_ops = { +const struct dentry_operations ceph_snap_dentry_ops = { .d_release = ceph_dentry_release, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7c08698fad3e..8c044a4f0457 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -317,7 +317,7 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; @@ -665,7 +665,7 @@ more: * throw out any page cache pages in this range. this * may block. 
*/ - truncate_inode_pages_range(inode->i_mapping, pos, + truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); @@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; + struct ceph_file_info *fi = filp->private_data; loff_t *ppos = &iocb->ki_pos; size_t len = iov->iov_len; struct inode *inode = filp->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - void *base = iov->iov_base; + void __user *base = iov->iov_base; ssize_t ret; - int got = 0; + int want, got = 0; int checkeof = 0, read = 0; dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: __ceph_do_pending_vmtruncate(inode); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, - &got, -1); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) goto out; dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); - if ((got & CEPH_CAP_FILE_CACHE) == 0 || + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || (inode->i_sb->s_flags & MS_SYNCHRONOUS)) /* hmm, this isn't really async... 
*/ @@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; - int got = 0; + int want, got = 0; int ret, err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -824,8 +829,11 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - &got, endoff); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) goto out; @@ -833,7 +841,7 @@ retry_snap: inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); - if ((got & CEPH_CAP_FILE_BUFFER) == 0 || + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, @@ -930,6 +938,8 @@ const struct file_operations ceph_file_fops = { .aio_write = ceph_aio_write, .mmap = ceph_mmap, .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, .unlocked_ioctl = ceph_ioctl, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 389f9dbd9949..5d893d31e399 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued, * the file is either opened or mmaped */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - 
CEPH_CAP_FILE_EXCL)) || + CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| + CEPH_CAP_FILE_EXCL| + CEPH_CAP_FILE_LAZYIO)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index d085f07756b4..76e307d2aba1 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return 0; } +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&inode->i_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + spin_unlock(&inode->i_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); } return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 25e4f1a9d059..88451a3b6857 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + #endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 000000000000..ae85af06454f --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,256 @@ +#include "ceph_debug.h" + +#include <linux/file.h> +#include <linux/namei.h> + +#include "super.h" 
+#include "mds_client.h" +#include "pagelist.h" + +/** + * Implement fcntl and flock locking functions. + */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + u64 pid, u64 pid_ns, + int cmd, u64 start, u64 length, u8 wait) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = igrab(inode); + + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type`: %d", (int)lock_type, + (int)operation, pid, start, length, wait, cmd); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.pid = cpu_to_le64(pid); + /* This should be adjusted, but I'm not sure if + namespaces actually get id numbers*/ + req->r_args.filelock_change.pid_namespace = + cpu_to_le64((u64)pid_ns); + req->r_args.filelock_change.start = cpu_to_le64(start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + err = ceph_mdsc_do_request(mdsc, inode, req); + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, + (int)operation, pid, start, length, wait, cmd, err); + return err; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. 
+ *
+ * F_SETLKW sets the wait flag in the MDS request; F_GETLK is sent as
+ * CEPH_MDS_OP_GETFILELOCK, every other command as CEPH_MDS_OP_SETFILELOCK.
+ * After a successful SETFILELOCK the lock is mirrored into the local VFS
+ * tables with posix_lock_file(); if that fails, an unlock for the same
+ * range is sent to the MDS to undo the change.
+ *
+ * Returns 0 on success or the error from the MDS / posix_lock_file().
+ */
+int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u64 length;
+	u8 lock_cmd;
+	int err;
+	u8 wait = 0;
+	u16 op = CEPH_MDS_OP_SETFILELOCK;
+
+	/*
+	 * NOTE(review): get_pid() takes a reference and nothing in this
+	 * function drops it -- confirm who releases fl->fl_nspid.
+	 */
+	fl->fl_nspid = get_pid(task_tgid(current));
+	dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+
+	/* set wait bit as appropriate, then make command as Ceph expects it*/
+	if (F_SETLKW == cmd)
+		wait = 1;
+	if (F_GETLK == cmd)
+		op = CEPH_MDS_OP_GETFILELOCK;
+
+	if (F_RDLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (F_WRLCK == fl->fl_type)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+
+	/* mds takes start and length; length 0 stands for "to end of file" */
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
+	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+				(u64)fl->fl_pid, (u64)fl->fl_nspid,
+				lock_cmd, fl->fl_start,
+				length, wait);
+	if (err) {
+		dout("mds returned error code %d", err);
+		return err;
+	}
+
+	/*
+	 * A GETFILELOCK is only a query.  Handing the request to
+	 * posix_lock_file() would install the queried lock locally, so stop
+	 * here.  NOTE(review): the MDS reply is not decoded into *fl here.
+	 */
+	if (CEPH_MDS_OP_GETFILELOCK == op)
+		return 0;
+
+	dout("mds locked, locking locally");
+	err = posix_lock_file(file, fl, NULL);
+	if (err) {
+		/* undo! This should only happen if the kernel detects
+		 * local deadlock. */
+		ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+				  (u64)fl->fl_pid, (u64)fl->fl_nspid,
+				  CEPH_LOCK_UNLOCK, fl->fl_start,
+				  length, 0);
+		dout("got %d on posix_lock_file, undid lock", err);
+	}
+	return err;
+}
+
+/*
+ * Attempt to set a flock lock.
+ *
+ * The request waits on the MDS unless LOCK_NB is set in @cmd.  The lock is
+ * first changed on the MDS (always as CEPH_MDS_OP_SETFILELOCK) and then
+ * mirrored locally with flock_lock_file_wait(); if the local step fails,
+ * an unlock for the same range is sent to the MDS.
+ *
+ * Returns 0 on success or the error from the MDS / flock_lock_file_wait().
+ */
+int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	u64 length;
+	u8 lock_cmd;
+	int err;
+	u8 wait = 1;
+
+	/* NOTE(review): same unreleased get_pid() reference as ceph_lock() */
+	fl->fl_nspid = get_pid(task_tgid(current));
+	dout("ceph_flock, fl_pid:%d", fl->fl_pid);
+
+	/* set wait bit, then clear it out of cmd*/
+	if (cmd & LOCK_NB)
+		wait = 0;
+	cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
+	/* set command sequence that Ceph wants to see:
+	   shared lock, exclusive lock, or unlock */
+	if (LOCK_SH == cmd)
+		lock_cmd = CEPH_LOCK_SHARED;
+	else if (LOCK_EX == cmd)
+		lock_cmd = CEPH_LOCK_EXCL;
+	else
+		lock_cmd = CEPH_LOCK_UNLOCK;
+	/* mds requires start and length rather than start and end */
+	if (LLONG_MAX == fl->fl_end)
+		length = 0;
+	else
+		length = fl->fl_end - fl->fl_start + 1;
+
+	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
+				file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+				lock_cmd, fl->fl_start,
+				length, wait);
+	if (!err) {
+		err = flock_lock_file_wait(file, fl);
+		if (err) {
+			/* local step failed: release what the MDS granted */
+			ceph_lock_message(CEPH_LOCK_FLOCK,
+					  CEPH_MDS_OP_SETFILELOCK,
+					  file, (u64)fl->fl_pid,
+					  (u64)fl->fl_nspid,
+					  CEPH_LOCK_UNLOCK, fl->fl_start,
+					  length, 0);
+			dout("got %d on flock_lock_file_wait, undid lock", err);
+		}
+	} else {
+		dout("mds error code %d", err);
+	}
+	return err;
+}
+
+/**
+ * Must be called with BKL already held. Fills in the passed
+ * counter variables, so you can prepare pagelist metadata before calling
+ * ceph_encode_locks.
+ */
+void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
+{
+	struct file_lock *lock;
+
+	*fcntl_count = 0;
+	*flock_count = 0;
+
+	/* a lock is counted once: FL_POSIX is checked before FL_FLOCK */
+	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+		if (lock->fl_flags & FL_POSIX)
+			++(*fcntl_count);
+		else if (lock->fl_flags & FL_FLOCK)
+			++(*flock_count);
+	}
+	dout("counted %d flock locks and %d fcntl locks",
+	     *flock_count, *fcntl_count);
+}
+
+/*
+ * Append a little-endian 32-bit lock count, then one struct ceph_filelock
+ * for every lock on the inode whose fl_flags contain @flag.
+ * Returns 0 or the first error from conversion / pagelist append.
+ */
+static int encode_locks_of_type(struct inode *inode,
+				struct ceph_pagelist *pagelist,
+				unsigned int flag, int num_locks)
+{
+	struct file_lock *lock;
+	struct ceph_filelock cephlock;
+	/* the rest of the record is little-endian; the count must be too */
+	__le32 nlocks = cpu_to_le32(num_locks);
+	int err;
+
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		return err;
+
+	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+		if (!(lock->fl_flags & flag))
+			continue;
+		err = lock_to_ceph_filelock(lock, &cephlock);
+		if (err)
+			return err;
+		err = ceph_pagelist_append(pagelist, &cephlock,
+					   sizeof(cephlock));
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/**
+ * Encode the flock and fcntl locks for the given inode into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Must be called with BKL already held, and the lock numbers should have
+ * been gathered under the same lock holding window.
+ *
+ * Returns 0 on success, or the first error reported by
+ * lock_to_ceph_filelock() or ceph_pagelist_append().
+ */
+int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
+		      int num_fcntl_locks, int num_flock_locks)
+{
+	int err;
+
+	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
+	     num_fcntl_locks);
+	err = encode_locks_of_type(inode, pagelist, FL_POSIX,
+				   num_fcntl_locks);
+	if (!err)
+		err = encode_locks_of_type(inode, pagelist, FL_FLOCK,
+					   num_flock_locks);
+	return err;
+}
+
+/*
+ * Given a pointer to a lock, convert it to a ceph filelock
+ */
+int lock_to_ceph_filelock(struct file_lock *lock,
+			  struct ceph_filelock *cephlock)
+{
+	int err = 0;
+
+
cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64(lock->fl_pid); + cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index dd440bd438a9..a75ddbf9fe37 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include <linux/wait.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/smp_lock.h> #include "mds_client.h" #include "mon_client.h" @@ -37,6 +38,11 @@ * are no longer valid. */ +struct ceph_reconnect_state { + struct ceph_pagelist *pagelist; + bool flock; +}; + static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); @@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req->r_path1); kfree(req->r_path2); put_request_session(req); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); kfree(req); } @@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc, { req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) - ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); __insert_request(mdsc, req); @@ -704,6 +711,51 @@ static int __open_session(struct ceph_mds_client *mdsc, } /* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static void __open_export_target_sessions(struct 
ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + int target; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + target = mi->export_targets[i]; + ts = __ceph_lookup_mds_session(mdsc, target); + if (!ts) { + ts = register_session(mdsc, target); + if (IS_ERR(ts)) + return; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + else + dout(" mds%d target mds%d %p is %s\n", session->s_mds, + i, ts, session_state_name(ts->s_state)); + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* * session caps */ @@ -764,7 +816,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, last_inode = NULL; } if (old_cap) { - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); old_cap = NULL; } @@ -793,7 +845,7 @@ out: if (last_inode) iput(last_inode); if (old_cap) - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); return ret; } @@ -1067,15 +1119,16 @@ static int trim_caps(struct ceph_mds_client *mdsc, * Called under s_mutex. 
*/ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra) + struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; + int extra = mdsc->client->mount_args->cap_release_safety; + int num; - if (extra < 0) - extra = mdsc->client->mount_args->cap_release_safety; + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); spin_lock(&session->s_cap_lock); @@ -1084,9 +1137,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg, list_head); head = msg->front.iov_base; - extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, @@ -1103,19 +1161,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; } - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - if (head->num) { - dout(" queueing non-full %p (%d)\n", msg, - le32_to_cpu(head->num)); - list_move_tail(&msg->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= - CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); - } + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; } err = 0; spin_unlock(&session->s_cap_lock); @@ -1250,6 +1303,7 @@ 
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1580,6 +1634,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, req->r_mds = mds; req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); @@ -1914,21 +1977,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); /* - * Tolerate 2 consecutive ESTALEs from the same mds. - * FIXME: we should be looking at the cap migrate_seq. + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE */ if (result == -ESTALE) { - req->r_direct_mode = USE_AUTH_MDS; - req->r_num_stale++; - if (req->r_num_stale <= 2) { + dout("got ESTALE on request %llu", req->r_tid); + if (!req->r_inode) { + /* do nothing; not an authority problem */ + } else if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; + } else { + struct ceph_inode_info *ci = ceph_inode(req->r_inode); + struct ceph_cap *cap = + ceph_get_cap_for_mds(ci, req->r_mds);; + + dout("already using auth"); + if ((!cap || cap != ci->i_auth_cap) || + (cap->mseq != req->r_sent_on_mseq)) { + dout("but cap changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } } - } else { - req->r_num_stale = 0; + dout("have to return 
ESTALE on request %llu", req->r_tid); } + if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); @@ -1985,7 +2067,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } mutex_unlock(&req->r_fill_mutex); @@ -2005,7 +2087,7 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session, -1); + ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2193,9 +2275,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_cap_reconnect rec; + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; struct ceph_inode_info *ci; - struct ceph_pagelist *pagelist = arg; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; int pathlen, err; u64 pathbase; @@ -2228,17 +2315,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&inode->i_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ - rec.cap_id = cpu_to_le64(cap->cap_id); - rec.pathbase = cpu_to_le64(pathbase); - rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); - rec.issued = cpu_to_le32(cap->issued); - rec.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.atime, &inode->i_atime); - rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = 
cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } spin_unlock(&inode->i_lock); - err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + + lock_kernel(); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + rec.v2.flock_len = (2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_encode_locks(inode, pagelist, + num_fcntl_locks, + num_flock_locks); + unlock_kernel(); + } out: kfree(path); @@ -2267,6 +2381,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; pr_info("mds%d reconnect start\n", mds); @@ -2301,7 +2416,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) goto fail; - err = iterate_session_caps(session, encode_caps_cb, pagelist); + + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; @@ -2326,6 +2444,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, } reply->pagelist = pagelist; + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); reply->hdr.data_len = 
cpu_to_le32(pagelist->length); reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); @@ -2376,9 +2496,11 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout("check_new_map mds%d state %s -> %s (session %s)\n", + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); if (memcmp(ceph_mdsmap_get_addr(oldmap, i), @@ -2428,6 +2550,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, wake_up_session_caps(s, 1); } } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } } @@ -2715,7 +2852,7 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s, -1); + ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -2764,6 +2901,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, client->min_caps); + return 0; } @@ -2959,6 +3099,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc) if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); } diff --git a/fs/ceph/mds_client.h 
b/fs/ceph/mds_client.h index 952410c60d09..ab7e89f5e344 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request { u64 r_tid; /* transaction id */ struct rb_node r_node; + struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ int r_mds; @@ -207,8 +208,8 @@ struct ceph_mds_request { int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ - int r_num_stale; int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ struct kref r_kref; struct list_head r_wait; @@ -267,6 +268,27 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. 
+ */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ + #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; #endif @@ -324,8 +346,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) } extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra); + struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); @@ -343,4 +364,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index c4c498e6dfef..040be6d1150b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; + struct ceph_timespec laggy_since; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) state_seq = ceph_decode_64(p); ceph_decode_copy(p, &addr, sizeof(addr)); ceph_decode_addr(&addr); - *p += sizeof(struct ceph_timespec); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info[mds].global_id = global_id; m->m_info[mds].state = state; 
m->m_info[mds].addr = addr; + m->m_info[mds].laggy = + (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); m->m_info[mds].num_export_targets = num_export_targets; if (num_export_targets) { m->m_info[mds].export_targets = diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h index eacc131aa5cb..4c5cb0880bba 100644 --- a/fs/ceph/mdsmap.h +++ b/fs/ceph/mdsmap.h @@ -13,6 +13,7 @@ struct ceph_mds_info { struct ceph_entity_addr addr; s32 state; int num_export_targets; + bool laggy; u32 *export_targets; }; @@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) return m->m_info[w].state; } +static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) +{ + if (w >= 0 && w < m->m_max_mds) + return m->m_info[w].laggy; + return false; +} + extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 15167b2daa55..2502d76fcec1 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -108,7 +108,7 @@ void ceph_msgr_exit(void) destroy_workqueue(ceph_msgr_wq); } -void ceph_msgr_flush() +void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } @@ -647,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); + con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1081,11 +1081,11 @@ static int process_banner(struct ceph_connection *con) sizeof(con->peer_addr)) != 0 && !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == 
con->peer_addr.nonce)) { - pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", + pr_warning("wrong peer, want %s/%d, got %s/%d\n", pr_addr(&con->peer_addr.in_addr), - le64_to_cpu(con->peer_addr.nonce), + (int)le32_to_cpu(con->peer_addr.nonce), pr_addr(&con->actual_peer_addr.in_addr), - le64_to_cpu(con->actual_peer_addr.nonce)); + (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } @@ -1123,8 +1123,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; - u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; + u64 sup_feat = CEPH_FEATURE_SUPPORTED; + u64 req_feat = CEPH_FEATURE_REQUIRED; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con) static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, unsigned int sec_len, - u32 *crc) + struct kvec *section, + unsigned int sec_len, u32 *crc) { int left; int ret; @@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con) /* middle */ if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, middle_len, + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, &con->in_middle_crc); if (ret <= 0) return ret; @@ -1920,7 +1921,7 @@ out: /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
- */ + */ if (con->auth_retry && con->ops->invalidate_authorizer) { dout("calling invalidate_authorizer()\n"); con->ops->invalidate_authorizer(con); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 54fe01c50706..b2a5a3e4a671 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -349,7 +349,7 @@ out: } /* - * statfs + * generic requests (e.g., statfs, poolop) */ static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) @@ -442,6 +442,35 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } +static int do_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + int err; + + /* register request */ + mutex_lock(&monc->mutex); + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; + ceph_con_send(monc->con, ceph_msg_get(req->request)); + mutex_unlock(&monc->mutex); + + err = wait_for_completion_interruptible(&req->completion); + + mutex_lock(&monc->mutex); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; + mutex_unlock(&monc->mutex); + + if (!err) + err = req->result; + return err; +} + +/* + * statfs + */ static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { @@ -468,7 +497,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt generic reply, no tid\n"); + pr_err("corrupt generic reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -487,6 +516,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) kref_init(&req->kref); req->buf = buf; + req->buf_len = sizeof(*buf); init_completion(&req->completion); err = -ENOMEM; @@ -504,33 +534,134 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - /* register request */ - 
mutex_lock(&monc->mutex); - req->tid = ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); - monc->num_generic_requests++; - mutex_unlock(&monc->mutex); + err = do_generic_request(monc, req); - /* send request and wait */ - ceph_con_send(monc->con, ceph_msg_get(req->request)); - err = wait_for_completion_interruptible(&req->completion); +out: + kref_put(&req->kref, release_generic_request); + return err; +} + +/* + * pool ops + */ +static int get_poolop_reply_buf(const char *src, size_t src_len, + char *dst, size_t dst_len) +{ + u32 buf_len; + + if (src_len != sizeof(u32) + dst_len) + return -EINVAL; + + buf_len = le32_to_cpu(*(u32 *)src); + if (buf_len != dst_len) + return -EINVAL; + + memcpy(dst, src + sizeof(u32), dst_len); + return 0; +} + +static void handle_poolop_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_poolop_reply *reply = msg->front.iov_base; + u64 tid = le64_to_cpu(msg->hdr.tid); + + if (msg->front.iov_len < sizeof(*reply)) + goto bad; + dout("handle_poolop_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; + req = __lookup_generic_req(monc, tid); + if (req) { + if (req->buf_len && + get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), + msg->front.iov_len - sizeof(*reply), + req->buf, req->buf_len) < 0) { + mutex_unlock(&monc->mutex); + goto bad; + } + req->result = le32_to_cpu(reply->reply_code); + get_generic_request(req); + } mutex_unlock(&monc->mutex); + if (req) { + complete(&req->completion); + put_generic_request(req); + } + return; - if (!err) - err = req->result; +bad: + pr_err("corrupt generic reply, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +/* + * Do a synchronous pool op. 
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+			u32 pool, u64 snapid,
+			char *buf, int len)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = len;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	req->request->hdr.version = cpu_to_le16(2);
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+	h->pool = cpu_to_le32(pool);
+	h->op = cpu_to_le32(op);
+	h->auid = 0;
+	h->snapid = cpu_to_le64(snapid);
+	h->name_len = 0;
+
+	err = do_generic_request(monc, req);
 
 out:
 	kref_put(&req->kref, release_generic_request);
 	return err;
 }
 
+/* Create an unmanaged snap in @pool; the reply payload is copied to *@snapid. */
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 *snapid)
+{
+	return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, 0, (char *)snapid, sizeof(*snapid));
+}
+
+/* Delete unmanaged snap @snapid from @pool; no reply payload is requested. */
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 snapid)
+{
+	return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
+				   pool, snapid, NULL, 0);
+}
+
 /*
- * Resend pending statfs requests.
+ * Resend pending generic requests.
*/ static void __resend_generic_request(struct ceph_mon_client *monc) { @@ -783,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_statfs_reply(monc, msg); break; + case CEPH_MSG_POOLOP_REPLY: + handle_poolop_reply(monc, msg); + break; + case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -820,6 +955,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msg_get(monc->m_subscribe_ack); break; + case CEPH_MSG_POOLOP_REPLY: case CEPH_MSG_STATFS_REPLY: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index 174d794321d0..8e396f2c0963 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -50,6 +50,7 @@ struct ceph_mon_generic_request { struct rb_node node; int result; void *buf; + int buf_len; struct completion completion; struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ @@ -111,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); +extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, + u32 pool, u64 *snapid); +extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, + u32 pool, u64 snapid); #endif diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 892a0298dfdf..680d3d648cac 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -1,5 +1,5 @@ -#ifndef __MSGR_H -#define __MSGR_H +#ifndef CEPH_MSGR_H +#define CEPH_MSGR_H /* * Data types for message passing layer used by Ceph. 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index e38522347898..bed6391e52c7 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -1276,8 +1276,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, /* it may be a short read due to an object boundary */ req->r_pages = pages; - num_pages = calc_pages_for(off, *plen); - req->r_num_pages = num_pages; dout("readpages final extent is %llu~%llu (%d pages)\n", off, *plen, req->r_num_pages); @@ -1319,7 +1317,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, /* it may be a short write due to an object boundary */ req->r_pages = pages; - req->r_num_pages = calc_pages_for(off, len); dout("writepages %llu~%llu (%d pages)\n", off, len, req->r_num_pages); @@ -1476,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con) * authentication */ static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -1497,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con, &o->o_authorizer_reply_buf, &o->o_authorizer_reply_buf_len); if (ret) - return ret; + return ret; } *proto = ac->protocol; diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 416d46adbf87..e31f118f1392 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) kfree(pi); } -void __decode_pool(void **p, struct ceph_pg_pool_info *pi) +static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { + unsigned n, m; + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); calc_pg_masks(pi); - *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + + /* num_snaps * snap_info_t */ + n = le32_to_cpu(pi->v.num_snaps); + while (n--) { + ceph_decode_need(p, 
end, sizeof(u64) + 1 + sizeof(u64) + + sizeof(struct ceph_timespec), bad); + *p += sizeof(u64) + /* key */ + 1 + sizeof(u64) + /* u8, snapid */ + sizeof(struct ceph_timespec); + m = ceph_decode_32(p); /* snap name */ + *p += m; + } + *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; + return 0; + +bad: + return -EINVAL; } static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) @@ -571,7 +589,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) kfree(pi); goto bad; } - __decode_pool(p, pi); + err = __decode_pool(p, end, pi); + if (err < 0) + goto bad; __insert_pg_pool(&map->pg_pools, pi); } @@ -760,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pi->id = pool; __insert_pg_pool(&map->pg_pools, pi); } - __decode_pool(p, pi); + err = __decode_pool(p, end, pi); + if (err < 0) + goto bad; } if (version >= 5 && __decode_pool_names(p, end, map) < 0) goto bad; @@ -833,7 +855,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, node)->pgid, pgid) <= 0) { struct ceph_pg_mapping *cur = rb_entry(rbp, struct ceph_pg_mapping, node); - + rbp = rb_next(rbp); dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); rb_erase(&cur->node, &map->pg_temp); @@ -1026,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { - pr_err("no crush rule pool %d type %d size %d\n", - poolid, pool->v.type, pool->v.size); + pr_err("no crush rule pool %d ruleset %d type %d size %d\n", + poolid, pool->v.crush_ruleset, pool->v.type, + pool->v.size); return NULL; } diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 8fcc023056c7..6d5247f2e81b 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -1,5 +1,5 @@ -#ifndef __RADOS_H -#define __RADOS_H +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H /* * Data types for the Ceph distributed object storage layer RADOS @@ -203,6 +203,7 @@ 
enum { CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, + CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, /** attrs **/ /* read */ @@ -272,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op) return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; } +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * any modification here needs to be updated there + */ #define CEPH_OSD_TMAP_HDR 'h' #define CEPH_OSD_TMAP_SET 's' #define CEPH_OSD_TMAP_RM 'r' @@ -297,6 +302,7 @@ enum { CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ }; enum { @@ -350,6 +356,9 @@ struct ceph_osd_op { struct { __le64 cookie, count; } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; }; __le32 payload_len; } __attribute__ ((packed)); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fa87f51e38e1..9922628532b2 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -2,6 +2,7 @@ #include "ceph_debug.h" #include <linux/backing-dev.h> +#include <linux/ctype.h> #include <linux/fs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -101,12 +102,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) } -static int ceph_syncfs(struct super_block *sb, int wait) +static int ceph_sync_fs(struct super_block *sb, int wait) { - dout("sync_fs %d\n", wait); + struct ceph_client *client = ceph_sb_to_client(sb); + + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(&client->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + + dout("sync_fs (blocking)\n"); ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); - dout("sync_fs %d done\n", wait); + dout("sync_fs 
(blocking) done\n"); return 0; } @@ -150,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) struct ceph_mount_args *args = client->mount_args; if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", - le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]), - le64_to_cpu(*(__le64 *)&args->fsid.fsid[8])); + seq_printf(m, ",fsid=%pU", &args->fsid); if (args->flags & CEPH_OPT_NOSHARE) seq_puts(m, ",noshare"); if (args->flags & CEPH_OPT_DIRSTAT) @@ -279,7 +287,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, - .sync_fs = ceph_syncfs, + .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, .statfs = ceph_statfs, @@ -322,9 +330,6 @@ const char *ceph_msg_type_name(int type) * mount options */ enum { - Opt_fsidmajor, - Opt_fsidminor, - Opt_monport, Opt_wsize, Opt_rsize, Opt_osdtimeout, @@ -339,6 +344,7 @@ enum { Opt_congestion_kb, Opt_last_int, /* int args above */ + Opt_fsid, Opt_snapdirname, Opt_name, Opt_secret, @@ -355,9 +361,6 @@ enum { }; static match_table_t arg_tokens = { - {Opt_fsidmajor, "fsidmajor=%ld"}, - {Opt_fsidminor, "fsidminor=%ld"}, - {Opt_monport, "monport=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_osdtimeout, "osdtimeout=%d"}, @@ -371,6 +374,7 @@ static match_table_t arg_tokens = { {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ + {Opt_fsid, "fsid=%s"}, {Opt_snapdirname, "snapdirname=%s"}, {Opt_name, "name=%s"}, {Opt_secret, "secret=%s"}, @@ -386,6 +390,36 @@ static match_table_t arg_tokens = { {-1, NULL} }; +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if 
(!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid %pU", err, fsid); + return err; +} static struct ceph_mount_args *parse_mount_args(int flags, char *options, const char *dev_name, @@ -469,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, dout("got token %d\n", token); } switch (token) { - case Opt_fsidmajor: - *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval); - break; - case Opt_fsidminor: - *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval); - break; case Opt_ip: err = ceph_parse_ips(argstr[0].from, argstr[0].to, @@ -485,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->flags |= CEPH_OPT_MYIP; break; + case Opt_fsid: + err = parse_fsid(argstr[0].from, &args->fsid); + if (err == 0) + args->flags |= CEPH_OPT_FSID; + break; case Opt_snapdirname: kfree(args->snapdir_name); args->snapdir_name = kstrndup(argstr[0].from, @@ -515,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_osdkeepalivetimeout: args->osd_keepalive_timeout = intval; break; + case Opt_osd_idle_ttl: + args->osd_idle_ttl = intval; + break; case Opt_mount_timeout: args->mount_timeout = intval; break; @@ -630,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) /* caps */ client->min_caps = args->max_readdir; - ceph_adjust_min_caps(client->min_caps); /* subsystems */ err = ceph_monc_init(&client->monc, client); @@ -680,8 +715,6 @@ static void ceph_destroy_client(struct ceph_client *client) ceph_monc_stop(&client->monc); - ceph_adjust_min_caps(-client->min_caps); - ceph_debugfs_client_cleanup(client); destroy_workqueue(client->wb_wq); destroy_workqueue(client->pg_inv_wq); @@ -706,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct 
ceph_fsid *fsid) { if (client->have_fsid) { if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, - PR_FSID(&client->fsid), PR_FSID(fsid)); + pr_err("bad fsid, had %pU got %pU", + &client->fsid, fsid); return -1; } } else { - pr_info("client%lld fsid " FSID_FORMAT "\n", - client->monc.auth->global_id, PR_FSID(fsid)); + pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, + fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); ceph_debugfs_client_init(client); client->have_fsid = true; @@ -1043,8 +1076,6 @@ static int __init init_ceph(void) if (ret) goto out_msgr; - ceph_caps_init(); - ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -1069,7 +1100,6 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); - ceph_caps_finalize(); destroy_caches(); ceph_msgr_exit(); ceph_debugfs_cleanup(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 10a4a406e887..2482d696f0de 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -31,6 +31,12 @@ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) /* + * Supported features + */ +#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK +#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR + +/* * mount options */ #define CEPH_OPT_FSID (1<<0) @@ -560,11 +566,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); -extern void ceph_caps_init(void); -extern void ceph_caps_finalize(void); -extern void ceph_adjust_min_caps(int delta); -extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); -extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern int 
ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_client *client, int *total, int *avail, int *used, int *reserved, int *min); @@ -738,13 +746,6 @@ extern struct kmem_cache *ceph_file_cachep; extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \ - "%02x%02x%02x%02x%02x%02x" -#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \ - (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \ - (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \ - (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15] - /* inode.c */ extern const struct inode_operations ceph_file_iops; @@ -806,13 +807,16 @@ static inline void ceph_remove_cap(struct ceph_cap *cap) __ceph_remove_cap(cap); spin_unlock(&inode->i_lock); } -extern void ceph_put_cap(struct ceph_cap *cap); +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); @@ -857,7 +861,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct inode_operations ceph_dir_iops; -extern struct 
dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); @@ -888,6 +892,14 @@ extern void ceph_debugfs_cleanup(void); extern int ceph_debugfs_client_init(struct ceph_client *client); extern void ceph_debugfs_client_cleanup(struct ceph_client *client); +/* locks.c */ +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, + int p_locks, int f_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); + static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) { if (dentry && dentry->d_parent) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 68aeebc69681..097a2654c00f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -337,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) } static int __build_xattrs(struct inode *inode) + __releases(inode->i_lock) + __acquires(inode->i_lock) { u32 namelen; u32 numattr = 0; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 66b9cf79c5ba..de89645777c7 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -177,7 +177,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, nbytes = req->uc_outSize; /* don't have more space! 
*/ } if (copy_from_user(req->uc_data, buf, nbytes)) { - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); retval = -EFAULT; goto out; @@ -254,8 +254,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, retval = -EFAULT; /* If request was not a signal, enqueue and don't free */ - if (!(req->uc_flags & REQ_ASYNC)) { - req->uc_flags |= REQ_READ; + if (!(req->uc_flags & CODA_REQ_ASYNC)) { + req->uc_flags |= CODA_REQ_READ; list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } @@ -315,19 +315,19 @@ static int coda_psdev_release(struct inode * inode, struct file * file) list_del(&req->uc_chain); /* Async requests need to be freed here */ - if (req->uc_flags & REQ_ASYNC) { + if (req->uc_flags & CODA_REQ_ASYNC) { CODA_FREE(req->uc_data, sizeof(struct coda_in_hdr)); kfree(req); continue; } - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } list_for_each_entry_safe(req, tmp, &vcp->vc_processing, uc_chain) { list_del(&req->uc_chain); - req->uc_flags |= REQ_ABORT; + req->uc_flags |= CODA_REQ_ABORT; wake_up(&req->uc_sleep); } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index f09c5ed76f6c..b8893ab6f9e6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -604,7 +604,7 @@ static void coda_unblock_signals(sigset_t *old) (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ (r)->uc_opcode != CODA_RELEASE) || \ - (r)->uc_flags & REQ_READ)) + (r)->uc_flags & CODA_REQ_READ)) static inline void coda_waitfor_upcall(struct upc_req *req) { @@ -624,7 +624,7 @@ static inline void coda_waitfor_upcall(struct upc_req *req) set_current_state(TASK_UNINTERRUPTIBLE); /* got a reply */ - if (req->uc_flags & (REQ_WRITE | REQ_ABORT)) + if (req->uc_flags & (CODA_REQ_WRITE | CODA_REQ_ABORT)) break; if (blocked && time_after(jiffies, timeout) && @@ -708,7 +708,7 @@ static int coda_upcall(struct venus_comm *vcp, coda_waitfor_upcall(req); /* Op went through, interrupt or 
not... */ - if (req->uc_flags & REQ_WRITE) { + if (req->uc_flags & CODA_REQ_WRITE) { out = (union outputArgs *)req->uc_data; /* here we map positive Venus errors to kernel errors */ error = -out->oh.result; @@ -717,13 +717,13 @@ static int coda_upcall(struct venus_comm *vcp, } error = -EINTR; - if ((req->uc_flags & REQ_ABORT) || !signal_pending(current)) { + if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { printk(KERN_WARNING "coda: Unexpected interruption.\n"); goto exit; } /* Interrupted before venus read it. */ - if (!(req->uc_flags & REQ_READ)) + if (!(req->uc_flags & CODA_REQ_READ)) goto exit; /* Venus saw the upcall, make sure we can send interrupt signal */ @@ -747,7 +747,7 @@ static int coda_upcall(struct venus_comm *vcp, sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; - sig_req->uc_flags = REQ_ASYNC; + sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; sig_req->uc_inSize = sizeof(struct coda_in_hdr); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 63ae85831464..70227e0dc01d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -131,23 +131,6 @@ static int w_long(unsigned int fd, unsigned int cmd, return err; } -static int rw_long(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) -{ - mm_segment_t old_fs = get_fs(); - int err; - unsigned long val; - - if(get_user(val, argp)) - return -EFAULT; - set_fs (KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&val); - set_fs (old_fs); - if (!err && put_user(val, argp)) - return -EFAULT; - return err; -} - struct compat_video_event { int32_t type; compat_time_t timestamp; @@ -594,12 +577,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd, return err; } -static int ioc_settimeout(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) -{ - return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, argp); -} - /* Bluetooth ioctls */ #define 
HCIUARTSETPROTO _IOW('U', 200, int) #define HCIUARTGETPROTO _IOR('U', 201, int) @@ -969,6 +946,7 @@ COMPATIBLE_IOCTL(TIOCGPGRP) COMPATIBLE_IOCTL(TIOCGPTN) COMPATIBLE_IOCTL(TIOCSPTLCK) COMPATIBLE_IOCTL(TIOCSERGETLSR) +COMPATIBLE_IOCTL(TIOCSIG) #ifdef TCGETS2 COMPATIBLE_IOCTL(TCGETS2) COMPATIBLE_IOCTL(TCSETS2) @@ -1284,13 +1262,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) COMPATIBLE_IOCTL(OSS_GETVERSION) -/* AUTOFS */ -COMPATIBLE_IOCTL(AUTOFS_IOC_CATATONIC) -COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) -COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) -COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI) -COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER) -COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) COMPATIBLE_IOCTL(RAW_GETBIND) @@ -1557,9 +1528,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case RAW_GETBIND: return raw_ioctl(fd, cmd, argp); #endif -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int) - case AUTOFS_IOC_SETTIMEOUT32: - return ioc_settimeout(fd, cmd, argp); /* One SMB ioctl needs translations. 
*/ #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) case SMB_IOC_GETMOUNTUID_32: @@ -1614,9 +1582,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* AUTOFS */ - case AUTOFS_IOC_READY: - case AUTOFS_IOC_FAIL: /* NBD */ case NBD_SET_SOCK: case NBD_SET_BLKSIZE: diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e8fcf4e2ed7d..622c95140802 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,7 +199,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); - goto out; + goto out_free; } } if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) @@ -207,7 +207,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); - goto out; + goto out_free; } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); @@ -292,12 +292,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) return rc; } -static int ecryptfs_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOTTY; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + return rc; +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOIOCTLCMD; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && 
lower_file->f_op->compat_ioctl) + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + return rc; +} +#endif const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, @@ -313,7 +341,10 @@ const struct file_operations ecryptfs_main_fops = { .write = do_sync_write, .aio_write = generic_file_aio_write, .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, @@ -322,20 +353,3 @@ const struct file_operations ecryptfs_main_fops = { .fasync = ecryptfs_fasync, .splice_read = generic_file_splice_read, }; - -static int -ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, - unsigned long arg) -{ - int rc = 0; - struct file *lower_file = NULL; - - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); - if (lower_file && lower_file->f_op && lower_file->f_op->ioctl) - rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode), - lower_file, cmd, arg); - else - rc = -ENOTTY; - return rc; -} diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 82900b063b1e..6c55113e7222 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -264,7 +264,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); - goto out_dput; + goto out_put; } ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); @@ -339,14 +339,85 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, out_free_kmem: 
kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; -out_dput: +out_put: dput(lower_dentry); + mntput(lower_mnt); d_drop(ecryptfs_dentry); out: return rc; } /** + * ecryptfs_new_lower_dentry + * @ename: The name of the new dentry. + * @lower_dir_dentry: Parent directory of the new dentry. + * @nd: nameidata from last lookup. + * + * Create a new dentry or get it from lower parent dir. + */ +static struct dentry * +ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry, + struct nameidata *nd) +{ + struct dentry *new_dentry; + struct dentry *tmp; + struct inode *lower_dir_inode; + + lower_dir_inode = lower_dir_dentry->d_inode; + + tmp = d_alloc(lower_dir_dentry, name); + if (!tmp) + return ERR_PTR(-ENOMEM); + + mutex_lock(&lower_dir_inode->i_mutex); + new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd); + mutex_unlock(&lower_dir_inode->i_mutex); + + if (!new_dentry) + new_dentry = tmp; + else + dput(tmp); + + return new_dentry; +} + + +/** + * ecryptfs_lookup_one_lower + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @lower_dir_dentry: lower parent directory + * + * Get the lower dentry from vfs. If lower dentry does not exist yet, + * create it. 
+ */ +static struct dentry * +ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dir_dentry) +{ + struct nameidata nd; + struct vfsmount *lower_mnt; + struct qstr *name; + int err; + + name = &ecryptfs_dentry->d_name; + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( + ecryptfs_dentry->d_parent)); + err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); + mntput(lower_mnt); + + if (!err) { + /* we dont need the mount */ + mntput(nd.path.mnt); + return nd.path.dentry; + } + if (err != -ENOENT) + return ERR_PTR(err); + + /* create a new lower dentry */ + return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd); +} + +/** * ecryptfs_lookup * @ecryptfs_dir_inode: The eCryptfs directory inode * @ecryptfs_dentry: The eCryptfs dentry that we are looking up @@ -373,14 +444,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); - lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, - lower_dir_dentry, - ecryptfs_dentry->d_name.len); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + + lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, + lower_dir_dentry); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); goto out_d_drop; @@ -402,14 +471,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); - lower_dentry = lookup_one_len(encrypted_and_encoded_name, - lower_dir_dentry, - encrypted_and_encoded_name_size - 1); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + lower_dentry = 
ecryptfs_lookup_one_lower(ecryptfs_dentry, + lower_dir_dentry); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); goto out_d_drop; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 46c4dd8dfcc3..bcb68c0cb1f0 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -274,7 +274,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct user_namespace *user_ns, struct pid *pid, u32 seq) { - struct ecryptfs_daemon *daemon; + struct ecryptfs_daemon *uninitialized_var(daemon); struct ecryptfs_msg_ctx *msg_ctx; size_t msg_size; struct nsproxy *nsproxy; diff --git a/fs/exofs/file.c b/fs/exofs/file.c index f9bfe2b501d5..68cb23e3bb98 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -30,9 +30,6 @@ * along with exofs; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - -#include <linux/buffer_head.h> - #include "exofs.h" static int exofs_release_file(struct inode *inode, struct file *filp) @@ -40,19 +37,27 @@ static int exofs_release_file(struct inode *inode, struct file *filp) return 0; } +/* exofs_file_fsync - flush the inode to disk + * + * Note, in exofs all metadata is written as part of inode, regardless. 
+ * The writeout is synchronous + */ static int exofs_file_fsync(struct file *filp, int datasync) { int ret; - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = filp->f_mapping->host; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* metadata-only; caller takes care of data */ + }; struct super_block *sb; - ret = filemap_write_and_wait(mapping); - if (ret) - return ret; + if (!(inode->i_state & I_DIRTY)) + return 0; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return 0; - /* sync the inode attributes */ - ret = write_inode_now(inode, 1); + ret = sync_inode(inode, &wbc); /* This is a good place to write the sb */ /* TODO: Sechedule an sb-sync on create */ @@ -65,9 +70,9 @@ static int exofs_file_fsync(struct file *filp, int datasync) static int exofs_flush(struct file *file, fl_owner_t id) { - exofs_file_fsync(file, 1); + int ret = vfs_fsync(file, 0); /* TODO: Flush the OSD target */ - return 0; + return ret; } const struct file_operations exofs_file_operations = { diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 088cb476b68a..eb7368ebd8cd 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -32,9 +32,6 @@ */ #include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/buffer_head.h> -#include <scsi/scsi_device.h> #include "exofs.h" @@ -773,15 +770,13 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) { EXOFS_DBGMSG("page 0x%lx\n", page->index); WARN_ON(1); - return try_to_free_buffers(page); + return 0; } static void exofs_invalidatepage(struct page *page, unsigned long offset) { - EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page)); + EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); WARN_ON(1); - - block_invalidatepage(page, offset); } const struct address_space_operations exofs_aops = { diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c index 4337cad7777b..6550bf70e41d 100644 --- a/fs/exofs/ios.c +++ 
b/fs/exofs/ios.c @@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) struct _striping_info { u64 obj_offset; u64 group_length; - u64 total_group_length; - u64 Major; unsigned dev; unsigned unit_off; }; @@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, (M * group_depth * stripe_unit); si->group_length = T - H; - si->total_group_length = T; - si->Major = M; } static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, @@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, } static int _prepare_one_group(struct exofs_io_state *ios, u64 length, - struct _striping_info *si, unsigned first_comp) + struct _striping_info *si) { unsigned stripe_unit = ios->layout->stripe_unit; unsigned mirrors_p1 = ios->layout->mirrors_p1; unsigned devs_in_group = ios->layout->group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned comp = first_comp + (dev - first_dev); unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; unsigned cur_pg = ios->pages_consumed; int ret = 0; while (length) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[comp]; + struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length, cur_len = stripe_unit; } - if (max_comp < comp) - max_comp = comp; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; + if (max_comp < dev) + max_comp = dev; } else { cur_len = stripe_unit; } @@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length, if (unlikely(ret)) goto out; - comp += mirrors_p1; - comp = (comp % devs_in_group) + first_comp; + dev += mirrors_p1; + dev = (dev % devs_in_group) + first_dev; length -= cur_len; } @@ -454,18 +446,15 @@ out: static int _prepare_for_striping(struct exofs_io_state *ios) { u64 length = ios->length; + u64 offset = ios->offset; struct _striping_info si; - unsigned devs_in_group = ios->layout->group_width * - ios->layout->mirrors_p1; - unsigned first_comp = 0; int ret = 0; - _calc_stripe_info(ios, ios->offset, &si); - if (!ios->pages) { if (ios->kern_buff) { struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + _calc_stripe_info(ios, ios->offset, &si); per_dev->offset = si.obj_offset; per_dev->dev = si.dev; @@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios) } while (length) { + _calc_stripe_info(ios, offset, &si); + if (length < si.group_length) si.group_length = length; - ret = _prepare_one_group(ios, si.group_length, &si, first_comp); + ret = _prepare_one_group(ios, si.group_length, &si); if (unlikely(ret)) goto out; + offset += si.group_length; length -= si.group_length; - - si.group_length = si.total_group_length; - si.unit_off = 0; - ++si.Major; - si.obj_offset = si.Major * ios->layout->stripe_unit * - ios->layout->group_depth; - - si.dev = (si.dev - (si.dev % devs_in_group)) + 
devs_in_group; - si.dev %= ios->layout->s_numdevs; - - first_comp += devs_in_group; - first_comp %= ios->layout->s_numdevs; } out: @@ -599,7 +579,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) } else { bio = master_dev->bio; /* FIXME: bio_set_dir() */ - bio->bi_rw |= (1 << BIO_RW); + bio->bi_rw |= REQ_WRITE; } osd_req_write(or, &ios->obj, per_dev->offset, bio, diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 32cfd61def5f..047e92fa3af8 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -31,7 +31,6 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <linux/smp_lock.h> #include <linux/string.h> #include <linux/parser.h> #include <linux/vfs.h> diff --git a/fs/fcntl.c b/fs/fcntl.c index 9d175d623aab..6769fd0f35b8 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -767,11 +767,22 @@ void kill_fasync(struct fasync_struct **fp, int sig, int band) } EXPORT_SYMBOL(kill_fasync); -static int __init fasync_init(void) +static int __init fcntl_init(void) { + /* please add new bits here to ensure allocation uniqueness */ + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + O_RDONLY | O_WRONLY | O_RDWR | + O_CREAT | O_EXCL | O_NOCTTY | + O_TRUNC | O_APPEND | O_NONBLOCK | + __O_SYNC | O_DSYNC | FASYNC | + O_DIRECT | O_LARGEFILE | O_DIRECTORY | + O_NOFOLLOW | O_NOATIME | O_CLOEXEC | + FMODE_EXEC + )); + fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); return 0; } -module_init(fasync_init) +module_init(fcntl_init) diff --git a/fs/file.c b/fs/file.c index cccaead962c2..0be344755c02 100644 --- a/fs/file.c +++ b/fs/file.c @@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static inline void * alloc_fdmem(unsigned int size) +static inline void *alloc_fdmem(unsigned int size) { - if (size <= PAGE_SIZE) - return kmalloc(size, GFP_KERNEL); - else - return 
vmalloc(size); + void *data; + + data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (data != NULL) + return data; + + return vmalloc(size); } -static inline void free_fdarr(struct fdtable *fdt) +static void free_fdmem(void *ptr) { - if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) - kfree(fdt->fd); - else - vfree(fdt->fd); + is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); } -static inline void free_fdset(struct fdtable *fdt) +static void __free_fdtable(struct fdtable *fdt) { - if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2)) - kfree(fdt->open_fds); - else - vfree(fdt->open_fds); + free_fdmem(fdt->fd); + free_fdmem(fdt->open_fds); + kfree(fdt); } static void free_fdtable_work(struct work_struct *work) @@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work) spin_unlock_bh(&f->lock); while(fdt) { struct fdtable *next = fdt->next; - vfree(fdt->fd); - free_fdset(fdt); - kfree(fdt); + + __free_fdtable(fdt); fdt = next; } } @@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu) container_of(fdt, struct files_struct, fdtab)); return; } - if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) { + if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) { kfree(fdt->fd); kfree(fdt->open_fds); kfree(fdt); @@ -183,7 +181,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) return fdt; out_arr: - free_fdarr(fdt); + free_fdmem(fdt->fd); out_fdt: kfree(fdt); out: @@ -213,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr) * caller and alloc_fdtable(). Cheaper to catch it here... 
*/ if (unlikely(new_fdt->max_fds <= nr)) { - free_fdarr(new_fdt); - free_fdset(new_fdt); - kfree(new_fdt); + __free_fdtable(new_fdt); return -EMFILE; } /* @@ -231,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr) free_fdtable(cur_fdt); } else { /* Somebody else expanded, so undo our attempt */ - free_fdarr(new_fdt); - free_fdset(new_fdt); - kfree(new_fdt); + __free_fdtable(new_fdt); } return 1; } @@ -323,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); - if (new_fdt != &newf->fdtab) { - free_fdarr(new_fdt); - free_fdset(new_fdt); - kfree(new_fdt); - } + if (new_fdt != &newf->fdtab) + __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { @@ -337,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { - free_fdarr(new_fdt); - free_fdset(new_fdt); - kfree(new_fdt); + __free_fdtable(new_fdt); *errorp = -EMFILE; goto out_release; } diff --git a/fs/file_table.c b/fs/file_table.c index b8a0bb63cbd7..2fc3b3c08911 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -298,11 +298,20 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); /* - * Lightweight file lookup - no refcnt increment if fd table isn't shared. - * You can use this only if it is guranteed that the current task already - * holds a refcnt to that file. That check has to be done at fget() only - * and a flag is returned to be passed to the corresponding fput_light(). - * There must not be a cloning between an fget_light/fput_light pair. + * Lightweight file lookup - no refcnt increment if fd table isn't shared. + * + * You can use this instead of fget if you satisfy all of the following + * conditions: + * 1) You must call fput_light before exiting the syscall and returning control + * to userspace (i.e. 
you cannot remember the returned struct file * after + * returning to userspace). + * 2) You must not call filp_close on the returned struct file * in between + * calls to fget_light and fput_light. + * 3) You must not clone the current task in between the calls to fget_light + * and fput_light. + * + * The fput_needed flag returned by fget_light should be passed to the + * corresponding fput_light. */ struct file *fget_light(unsigned int fd, int *fput_needed) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b7c7586caea1..2f76c4a081a2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/tracepoint.h> #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/writeback.h> + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. 
@@ -65,22 +74,21 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { - spin_lock(&bdi->wb_lock); - list_add_tail(&work->list, &bdi->work_list); - spin_unlock(&bdi->wb_lock); + trace_writeback_queue(bdi, work); - /* - * If the default thread isn't there, make sure we add it. When - * it gets created and wakes up, we'll run this work. - */ - if (unlikely(list_empty_careful(&bdi->wb_list))) + spin_lock_bh(&bdi->wb_lock); + list_add_tail(&work->list, &bdi->work_list); + if (bdi->wb.task) { + wake_up_process(bdi->wb.task); + } else { + /* + * The bdi thread isn't there, wake up the forker thread which + * will create and run it. + */ + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); - else { - struct bdi_writeback *wb = &bdi->wb; - - if (wb->task) - wake_up_process(wb->task); } + spin_unlock_bh(&bdi->wb_lock); } static void @@ -95,8 +103,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -643,10 +653,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -674,6 +688,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); @@ -686,17 +701,17 @@ static long 
wb_writeback(struct bdi_writeback *wb, * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb) +get_next_work_item(struct backing_dev_info *bdi) { struct wb_writeback_work *work = NULL; - spin_lock(&bdi->wb_lock); + spin_lock_bh(&bdi->wb_lock); if (!list_empty(&bdi->work_list)) { work = list_entry(bdi->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock(&bdi->wb_lock); + spin_unlock_bh(&bdi->wb_lock); return work; } @@ -744,7 +759,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) struct wb_writeback_work *work; long wrote = 0; - while ((work = get_next_work_item(bdi, wb)) != NULL) { + while ((work = get_next_work_item(bdi)) != NULL) { /* * Override sync mode, in case we must wait for completion * because this thread is exiting now. @@ -752,6 +767,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -776,47 +793,66 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * Handle writeback of dirty data for the device backed by this bdi. Also * wakes up periodically and does kupdated style flushing. 
*/ -int bdi_writeback_task(struct bdi_writeback *wb) +int bdi_writeback_thread(void *data) { - unsigned long last_active = jiffies; - unsigned long wait_jiffies = -1UL; + struct bdi_writeback *wb = data; + struct backing_dev_info *bdi = wb->bdi; long pages_written; + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + wb->last_active = jiffies; + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { + /* + * Remove own delayed wake-up timer, since we are already awake + * and we'll take care of the periodic write-back. + */ + del_timer(&wb->wakeup_timer); + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) - last_active = jiffies; - else if (wait_jiffies != -1UL) { - unsigned long max_idle; + wb->last_active = jiffies; - /* - * Longest period of inactivity that we tolerate. If we - * see dirty data again later, the task will get - * recreated automatically. - */ - max_idle = max(5UL * 60 * HZ, wait_jiffies); - if (time_after(jiffies, max_idle + last_active)) - break; + set_current_state(TASK_INTERRUPTIBLE); + if (!list_empty(&bdi->work_list)) { + __set_current_state(TASK_RUNNING); + continue; } - if (dirty_writeback_interval) { - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); - } else { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty_careful(&wb->bdi->work_list) && - !kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); + if (wb_has_dirty_io(wb) && dirty_writeback_interval) + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + else { + /* + * We have nothing to do, so can go sleep without any + * timeout and save power. When a work is queued or + * something is made dirty - we will be woken up. 
+ */ + schedule(); } try_to_freeze(); } + /* Flush any work that raced with us exiting */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } + /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. @@ -891,6 +927,8 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; + struct backing_dev_info *bdi = NULL; + bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -944,22 +982,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - struct backing_dev_info *bdi = wb->bdi; - - if (bdi_cap_writeback_dirty(bdi) && - !test_bit(BDI_registered, &bdi->state)) { - WARN_ON(1); - printk(KERN_ERR "bdi-%s not registered\n", - bdi->name); + bdi = inode_to_bdi(inode); + + if (bdi_cap_writeback_dirty(bdi)) { + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi-%s not registered\n", bdi->name); + + /* + * If this is the first dirty inode for this + * bdi, we have to wake-up the corresponding + * bdi thread to make sure background + * write-back happens later. 
+ */ + if (!wb_has_dirty_io(&bdi->wb)) + wakeup_bdi = true; } inode->dirtied_when = jiffies; - list_move(&inode->i_list, &wb->b_dirty); + list_move(&inode->i_list, &bdi->wb.b_dirty); } } out: spin_unlock(&inode_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6a857e24f947..cde1248a6225 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -595,7 +595,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); + submit_bh(WRITE_BARRIER | REQ_META, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -605,7 +605,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) lock_buffer(bh); skip_barrier: get_bh(bh); - submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | REQ_META, bh); wait_on_buffer(bh); } if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 18176d0b75d7..f3b071f921aa 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -36,8 +36,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE)); + int write_op = REQ_META | + (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + submit_bh(READ_SYNC | REQ_META, bh); if (!(flags & DIO_WAIT)) return 0; @@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); + ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4f44bdeb2f03..4d4b1e8ac64c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -274,7 +274,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + submit_bio(READ_SYNC | REQ_META, bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fa96bbb26343..2d7f165d0f1d 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -86,46 +86,25 @@ struct ea_buffer { #define EA_MALLOC 0x0008 +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + /* * These three routines are used to recognize on-disk extended attributes * that are in a recognized namespace. If the attribute is not recognized, * "os2." is prepended to the name */ -static inline int is_os2_xattr(struct jfs_ea *ea) +static int is_os2_xattr(struct jfs_ea *ea) { - /* - * Check for "system." 
- */ - if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return false; - /* - * Check for "user." - */ - if ((ea->namelen >= XATTR_USER_PREFIX_LEN) && - !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return false; - /* - * Check for "security." - */ - if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - return false; - /* - * Check for "trusted." - */ - if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) && - !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - return false; - /* - * Add any other valid namespace prefixes here - */ - - /* - * We assume it's OS/2's flat namespace - */ - return true; + return !is_known_namespace(ea->name); } static inline int name_size(struct jfs_ea *ea) @@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return can_set_system_xattr(inode, name, value, value_len); + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + /* * Don't allow setting an attribute in an unknown namespace. 
*/ if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && - strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return -EOPNOTSUPP; return 0; @@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, int xattr_size; ssize_t size; int namelen = strlen(name); - char *os2name = NULL; char *value; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, - GFP_KERNEL); - if (!os2name) - return -ENOMEM; - strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); - name = os2name; - namelen -= XATTR_OS2_PREFIX_LEN; - } - down_read(&JFS_IP(inode)->xattr_sem); xattr_size = ea_get(inode, &ea_buf, 0); @@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, out: up_read(&JFS_IP(inode)->xattr_sem); - kfree(os2name); - return size; } @@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." 
+ */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); return err; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 023c03d02070..84a8cfc4e38e 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,7 +20,6 @@ #include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2e6a2723b8fa..4588fb9e93df 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -508,7 +508,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); } diff --git a/fs/open.c b/fs/open.c index b715d06fbe36..630715f9f73d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1031,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable - * file descriptors + * file descriptors. The function is not supposed to ever fail, the only + * reason it returns an 'int' and not 'void' is so that it can be plugged + * directly into file_operations structure. 
*/ int nonseekable_open(struct inode *inode, struct file *filp) { diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index 6921e7890be6..fbeb697374d5 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data, nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | (le32_to_cpu(dr->disc_size) >> 9); - if (name) - printk(" [%s]", name); + if (name) { + strlcat(state->pp_buf, " [", PAGE_SIZE); + strlcat(state->pp_buf, name, PAGE_SIZE); + strlcat(state->pp_buf, "]", PAGE_SIZE); + } put_partition(state, slot, first_sector, nr_sects); return dr; } @@ -81,14 +84,14 @@ static int riscix_partition(struct parsed_partitions *state, if (!rr) return -1; - printk(" [RISCiX]"); + strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); if (rr->magic == RISCIX_MAGIC) { unsigned long size = nr_sects > 2 ? 2 : nr_sects; int part; - printk(" <"); + strlcat(state->pp_buf, " <", PAGE_SIZE); put_partition(state, slot++, first_sect, size); for (part = 0; part < 8; part++) { @@ -97,11 +100,13 @@ static int riscix_partition(struct parsed_partitions *state, put_partition(state, slot++, le32_to_cpu(rr->part[part].start), le32_to_cpu(rr->part[part].length)); - printk("(%s)", rr->part[part].name); + strlcat(state->pp_buf, "(", PAGE_SIZE); + strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); + strlcat(state->pp_buf, ")", PAGE_SIZE); } } - printk(" >\n"); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); } else { put_partition(state, slot++, first_sect, nr_sects); } @@ -131,7 +136,7 @@ static int linux_partition(struct parsed_partitions *state, struct linux_part *linuxp; unsigned long size = nr_sects > 2 ? 
2 : nr_sects; - printk(" [Linux]"); + strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); put_partition(state, slot++, first_sect, size); @@ -139,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state, if (!linuxp) return -1; - printk(" <"); + strlcat(state->pp_buf, " <", PAGE_SIZE); while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { if (slot == state->limit) @@ -149,7 +154,7 @@ static int linux_partition(struct parsed_partitions *state, le32_to_cpu(linuxp->nr_sects)); linuxp ++; } - printk(" >"); + strlcat(state->pp_buf, " >", PAGE_SIZE); put_dev_sector(sect); return slot; @@ -294,7 +299,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) break; } } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif @@ -367,7 +372,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) return 0; } - printk(" [ICS]"); + strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { u32 start = le32_to_cpu(p->start); @@ -401,7 +406,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) } put_dev_sector(sect); - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif @@ -461,7 +466,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) return 0; } - printk(" [POWERTEC]"); + strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { u32 start = le32_to_cpu(p->start); @@ -472,7 +477,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) } put_dev_sector(sect); - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } #endif @@ -543,7 +548,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) size = get_capacity(state->bdev->bd_disk); put_partition(state, slot++, start, size - start); - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); } return i ? 
1 : 0; diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index ba443d4229f8..70cbf44a1560 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c @@ -69,7 +69,13 @@ int amiga_partition(struct parsed_partitions *state) /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; - printk(" RDSK (%d)", blksize * 512); /* Be more informative */ + { + char tmp[7 + 10 + 1 + 1]; + + /* Be more informative */ + snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { @@ -106,23 +112,27 @@ int amiga_partition(struct parsed_partitions *state) { /* Be even more informative to aid mounting */ char dostype[4]; + char tmp[42]; + __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') - printk(" (%c%c%c^%c)", + snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", dostype[0], dostype[1], dostype[2], dostype[3] + '@' ); else - printk(" (%c%c%c%c)", + snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", dostype[0], dostype[1], dostype[2], dostype[3]); - printk("(res %d spb %d)", + strlcat(state->pp_buf, tmp, PAGE_SIZE); + snprintf(tmp, sizeof(tmp), "(res %d spb %d)", be32_to_cpu(pb->pb_Environment[6]), be32_to_cpu(pb->pb_Environment[4])); + strlcat(state->pp_buf, tmp, PAGE_SIZE); } res = 1; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); rdb_done: return res; diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 4439ff1b6cec..9875b05e80a2 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c @@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state) } pi = &rs->part[0]; - printk (" AHDI"); + strlcat(state->pp_buf, " AHDI", PAGE_SIZE); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; @@ -81,7 +81,7 @@ int atari_partition(struct 
parsed_partitions *state) #ifdef ICD_PARTS part_fmt = 1; #endif - printk(" XGM<"); + strlcat(state->pp_buf, " XGM<", PAGE_SIZE); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, &sect2); @@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state) break; } } - printk(" >"); + strlcat(state->pp_buf, " >", PAGE_SIZE); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { - printk(" ICD<"); + strlcat(state->pp_buf, " ICD<", PAGE_SIZE); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) @@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state) be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } - printk(" >"); + strlcat(state->pp_buf, " >", PAGE_SIZE); } } #endif put_dev_sector(sect); - printk ("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 72c52656dc2e..79fbf3f390f0 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -164,10 +164,16 @@ check_partition(struct gendisk *hd, struct block_device *bdev) state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); if (!state) return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + kfree(state); + return NULL; + } + state->pp_buf[0] = '\0'; state->bdev = bdev; disk_name(hd, 0, state->name); - printk(KERN_INFO " %s:", state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -185,17 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev) } } - if (res > 0) + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); return state; + } if 
(state->access_beyond_eod) err = -ENOSPC; if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; if (!res) - printk(" unknown partition table\n"); + strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); else if (warn_no_part) - printk(" unable to read partition table\n"); + strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); + + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); kfree(state); return ERR_PTR(res); } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 52f8bd399396..8e4e103ba216 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -16,6 +16,7 @@ struct parsed_partitions { int next; int limit; bool access_beyond_eod; + char *pp_buf; }; static inline void *read_part_sector(struct parsed_partitions *state, @@ -32,9 +33,12 @@ static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { if (n < p->limit) { + char tmp[1 + BDEVNAME_SIZE + 10 + 1]; + p->parts[n].from = from; p->parts[n].size = size; - printk(" %s%d", p->name, n); + snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); + strlcat(p->pp_buf, tmp, PAGE_SIZE); } } diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 9efb2cfe2410..dbb44d4bb8a7 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -630,6 +630,6 @@ int efi_partition(struct parsed_partitions *state) } kfree(ptes); kfree(gpt); - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc8497643fd0..d1b8a5c4bc0a 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -75,6 +75,7 @@ int ibm_partition(struct parsed_partitions *state) unsigned char *data; Sector sect; sector_t labelsect; + char tmp[64]; res = 0; blocksize = bdev_logical_block_size(bdev); @@ -144,13 +145,15 @@ int ibm_partition(struct parsed_partitions *state) */ blocksize = label->cms.block_size; if 
(label->cms.disk_offset != 0) { - printk("CMS1/%8s(MDSK):", name); + snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); /* disk is reserved minidisk */ offset = label->cms.disk_offset; size = (label->cms.block_count - 1) * (blocksize >> 9); } else { - printk("CMS1/%8s:", name); + snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); offset = (info->label_block + 1); size = label->cms.block_count * (blocksize >> 9); @@ -159,7 +162,8 @@ int ibm_partition(struct parsed_partitions *state) size-offset*(blocksize >> 9)); } else { if (strncmp(type, "LNX1", 4) == 0) { - printk("LNX1/%8s:", name); + snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); if (label->lnx.ldl_version == 0xf2) { fmt_size = label->lnx.formatted_blocks * (blocksize >> 9); @@ -178,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state) offset = (info->label_block + 1); } else { /* unlabeled disk */ - printk("(nonl)"); + strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); size = i_size >> 9; offset = (info->label_block + 1); } @@ -197,7 +201,8 @@ int ibm_partition(struct parsed_partitions *state) * if not, something is wrong, skipping partition detection */ if (strncmp(type, "VOL1", 4) == 0) { - printk("VOL1/%8s:", name); + snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); /* * get block number and read then go through format1 * labels @@ -253,7 +258,7 @@ int ibm_partition(struct parsed_partitions *state) } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); goto out_freeall; diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 1cc928bb762f..0ea19312706b 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c @@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state) } slot++; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } diff --git a/fs/partitions/ldm.c 
b/fs/partitions/ldm.c index 648c9d8f3357..5bf8a04b5d9b 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, return false; } - printk (" [LDM]"); + strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); /* Create the data partitions */ list_for_each (item, &ldb->v_part) { @@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, part_num++; } - printk ("\n"); + strlcat(pp->pp_buf, "\n", PAGE_SIZE); return true; } diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index 74465ff7c263..68d6a216ee79 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -59,7 +59,7 @@ int mac_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; /* not a MacOS disk */ } - printk(" [mac]"); + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); blocks_in_map = be32_to_cpu(part->map_count); for (blk = 1; blk <= blocks_in_map; ++blk) { int pos = blk * secsize; @@ -128,6 +128,6 @@ int mac_partition(struct parsed_partitions *state) #endif put_dev_sector(sect); - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 15bfb7b1e044..5f79a6677c69 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -213,10 +213,18 @@ static void parse_solaris_x86(struct parsed_partitions *state, put_dev_sector(sect); return; } - printk(" %s%d: <solaris:", state->name, origin); + { + char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } if (le32_to_cpu(v->v_version) != 1) { - printk(" cannot handle version %d vtoc>\n", - le32_to_cpu(v->v_version)); + char tmp[64]; + + snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); + strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); return; } @@ -224,9 +232,12 @@ static void 
parse_solaris_x86(struct parsed_partitions *state, max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; for (i=0; i<max_nparts && state->next<state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; + char tmp[3 + 10 + 1 + 1]; + if (s->s_size == 0) continue; - printk(" [s%d]", i); + snprintf(tmp, sizeof(tmp), " [s%d]", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, @@ -234,7 +245,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, le32_to_cpu(s->s_size)); } put_dev_sector(sect); - printk(" >\n"); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } @@ -250,6 +261,7 @@ static void parse_bsd(struct parsed_partitions *state, Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; + char tmp[64]; l = read_part_sector(state, offset + 1, &sect); if (!l) @@ -258,7 +270,9 @@ static void parse_bsd(struct parsed_partitions *state, put_dev_sector(sect); return; } - printk(" %s%d: <%s:", state->name, origin, flavour); + + snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); + strlcat(state->pp_buf, tmp, PAGE_SIZE); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); @@ -275,16 +289,18 @@ static void parse_bsd(struct parsed_partitions *state, /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { - printk("bad subpartition - ignored\n"); + strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); - if (le16_to_cpu(l->d_npartitions) > max_partitions) - printk(" (ignored %d more)", - le16_to_cpu(l->d_npartitions) - max_partitions); - printk(" >\n"); + if (le16_to_cpu(l->d_npartitions) > max_partitions) { + snprintf(tmp, sizeof(tmp), " (ignored %d more)", + 
le16_to_cpu(l->d_npartitions) - max_partitions); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); } #endif @@ -333,7 +349,12 @@ static void parse_unixware(struct parsed_partitions *state, put_dev_sector(sect); return; } - printk(" %s%d: <unixware:", state->name, origin); + { + char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { @@ -347,7 +368,7 @@ static void parse_unixware(struct parsed_partitions *state, p++; } put_dev_sector(sect); - printk(" >\n"); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } @@ -376,8 +397,10 @@ static void parse_minix(struct parsed_partitions *state, * the normal boot sector. */ if (msdos_magic_present (data + 510) && SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ + char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; - printk(" %s%d: <minix:", state->name, origin); + snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; @@ -386,7 +409,7 @@ static void parse_minix(struct parsed_partitions *state, put_partition(state, state->next++, start_sect(p), nr_sects(p)); } - printk(" >\n"); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ @@ -425,7 +448,7 @@ int msdos_partition(struct parsed_partitions *state) if (aix_magic_present(state, data)) { put_dev_sector(sect); - printk( " [AIX]"); + strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; } @@ -446,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state) fb = (struct fat_boot_sector *) data; if (slot == 1 && fb->reserved && fb->fats && fat_valid_media(fb->media)) { - printk("\n"); + 
strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } else { @@ -491,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state) n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); - printk(" <"); + strlcat(state->pp_buf, " <", PAGE_SIZE); parse_extended(state, start, size); - printk(" >"); + strlcat(state->pp_buf, " >", PAGE_SIZE); continue; } put_partition(state, slot, start, size); if (SYS_IND(p) == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (SYS_IND(p) == DM6_PARTITION) - printk("[DM]"); + strlcat(state->pp_buf, "[DM]", PAGE_SIZE); if (SYS_IND(p) == EZD_PARTITION) - printk("[EZD]"); + strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ p = (struct partition *) (0x1be + data); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index fc22b85d436a..48cec7cbca17 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state) le32_to_cpu(partition->p_size)); slot++; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index 43b1df9aa16c..ea8a86dceaf4 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state) } slot++; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index a32660e25f7f..b5b6fcfb3d36 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state) } slot++; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 9030c864428e..9627ccffc1c4 100644 --- 
a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c @@ -54,6 +54,7 @@ int sysv68_partition(struct parsed_partitions *state) unsigned char *data; struct dkblk0 *b; struct slice *slice; + char tmp[64]; data = read_part_sector(state, 0, &sect); if (!data) @@ -73,7 +74,8 @@ int sysv68_partition(struct parsed_partitions *state) return -1; slices -= 1; /* last slice is the whole disk */ - printk("sysV68: %s(s%u)", state->name, slices); + snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); + strlcat(state->pp_buf, tmp, PAGE_SIZE); slice = (struct slice *)data; for (i = 0; i < slices; i++, slice++) { if (slot == state->limit) @@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state) put_partition(state, slot, be32_to_cpu(slice->blkoff), be32_to_cpu(slice->nblocks)); - printk("(s%u)", i); + snprintf(tmp, sizeof(tmp), "(s%u)", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); } slot++; } - printk("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index db9eef260364..8dbaf9f77a99 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c @@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state) label->pt_part[i].pi_blkoff, label->pt_part[i].pi_nblocks); put_dev_sector(sect); - printk ("\n"); + strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } else { put_dev_sector(sect); diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 11a7b5c68153..2758e2afc518 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -2,7 +2,7 @@ # Makefile for the Linux proc filesystem routines. 
# -obj-$(CONFIG_PROC_FS) += proc.o +obj-y += proc.o proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 19fbc810e8e7..1ec952b1f036 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -983,7 +983,6 @@ static int flush_older_commits(struct super_block *s, static int reiserfs_async_progress_wait(struct super_block *s) { - DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) { diff --git a/fs/signalfd.c b/fs/signalfd.c index f329849ce3c0..1c5a6add779d 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; case __SI_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); @@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; default: /* diff --git a/fs/splice.c b/fs/splice.c index efdbfece9932..8f1dfaecc8f0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -399,17 +399,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { - /* - * If in nonblock mode then dont block on waiting - * for an in-flight io page - */ - if (flags & SPLICE_F_NONBLOCK) { - if (!trylock_page(page)) { - error = -EAGAIN; - break; - } - } else - lock_page(page); + lock_page(page); /* * Page was truncated, or invalidated by the @@ -597,7 +587,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, 
struct page *pages[PIPE_DEF_BUFFERS]; struct partial_page partial[PIPE_DEF_BUFFERS]; struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; - pgoff_t index; ssize_t res; size_t this_len; int error; @@ -621,7 +610,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, goto shrink_ret; } - index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index cc6ce8a84c21..e5f63da64d04 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -5,13 +5,13 @@ config SQUASHFS help Saying Y here includes support for SquashFS 4.0 (a Compressed Read-Only File System). Squashfs is a highly compressed read-only - filesystem for Linux. It uses zlib compression to compress both + filesystem for Linux. It uses zlib/lzo compression to compress both files, inodes and directories. Inodes in the system are very small and all blocks are packed to minimise data overhead. Block sizes greater than 4K are supported up to a maximum of 1 Mbytes (default block size 128K). SquashFS 4.0 supports 64 bit filesystems and files (larger than 4GB), full uid/gid information, hard links and - timestamps. + timestamps. Squashfs is intended for general read-only filesystem use, for archival use (i.e. in cases where a .tar.gz file may be used), and in @@ -26,7 +26,7 @@ config SQUASHFS If unsure, say N. -config SQUASHFS_XATTRS +config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS default n @@ -37,9 +37,24 @@ config SQUASHFS_XATTRS If unsure, say N. -config SQUASHFS_EMBEDDED +config SQUASHFS_LZO + bool "Include support for LZO compressed file systems" + depends on SQUASHFS + default n + select LZO_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZO compression. LZO compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. 
- bool "Additional option for memory-constrained systems" + LZO is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + bool "Additional option for memory-constrained systems" depends on SQUASHFS default n help diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 2cee3e9fa452..7672bac8d328 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o -squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o - +squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 157478da6ac9..24af9ce9722f 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; +#ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; +#endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, 0, "unknown", 0 @@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = { static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, &squashfs_lzma_unsupported_comp_ops, +#ifdef CONFIG_SQUASHFS_LZO + &squashfs_lzo_comp_ops, +#else &squashfs_lzo_unsupported_comp_ops, +#endif &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c new file mode 100644 index 000000000000..5d87789bf1c1 --- /dev/null +++ b/fs/squashfs/lzo_wrapper.c @@ -0,0 +1,136 @@ +/* + * Squashfs - a 
compressed read only filesystem for Linux + * + * Copyright (c) 2010 LG Electronics + * Chan Jeong <chan.jeong@lge.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * lzo_wrapper.c + */ + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/lzo.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +struct squashfs_lzo { + void *input; + void *output; +}; + +static void *lzo_init(struct squashfs_sb_info *msblk) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + + struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed2; + + return stream; + +failed2: + vfree(stream->input); +failed: + ERROR("Failed to allocate lzo workspace\n"); + kfree(stream); + return NULL; +} + + +static void lzo_free(void *strm) +{ + struct squashfs_lzo *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, + 
struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + struct squashfs_lzo *stream = msblk->stream; + void *buff = stream->input; + int avail, i, bytes = length, res; + size_t out_len = srclength; + + mutex_lock(&msblk->read_data_mutex); + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lzo1x_decompress_safe(stream->input, (size_t)length, + stream->output, &out_len); + if (res != LZO_E_OK) + goto failed; + + res = bytes = (int)out_len; + for (i = 0, buff = stream->output; bytes && i < pages; i++) { + avail = min_t(int, bytes, PAGE_CACHE_SIZE); + memcpy(buffer[i], buff, avail); + buff += avail; + bytes -= avail; + } + + mutex_unlock(&msblk->read_data_mutex); + return res; + +block_release: + for (; i < b; i++) + put_bh(bh[i]); + +failed: + mutex_unlock(&msblk->read_data_mutex); + + ERROR("lzo decompression failed, data probably corrupt\n"); + return -EIO; +} + +const struct squashfs_decompressor squashfs_lzo_comp_ops = { + .init = lzo_init, + .free = lzo_free, + .decompress = lzo_uncompress, + .id = LZO_COMPRESSION, + .name = "lzo", + .supported = 1 +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 733a17c42945..5d45569d5f72 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -104,3 +104,6 @@ extern const struct xattr_handler *squashfs_xattr_handlers[]; /* zlib_wrapper.c */ extern const struct squashfs_decompressor squashfs_zlib_comp_ops; + +/* lzo_wrapper.c */ +extern const struct squashfs_decompressor squashfs_lzo_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 8eabb808b78d..c5137fc9ab11 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -274,7 +274,7 @@ struct squashfs_base_inode { __le16 uid; __le16 
guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; }; struct squashfs_ipc_inode { @@ -283,7 +283,7 @@ struct squashfs_ipc_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; }; @@ -293,7 +293,7 @@ struct squashfs_lipc_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; __le32 xattr; }; @@ -304,7 +304,7 @@ struct squashfs_dev_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; __le32 rdev; }; @@ -315,7 +315,7 @@ struct squashfs_ldev_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; __le32 rdev; __le32 xattr; @@ -327,7 +327,7 @@ struct squashfs_symlink_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; __le32 symlink_size; char symlink[0]; @@ -339,7 +339,7 @@ struct squashfs_reg_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 start_block; __le32 fragment; __le32 offset; @@ -353,7 +353,7 @@ struct squashfs_lreg_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le64 start_block; __le64 file_size; __le64 sparse; @@ -370,7 +370,7 @@ struct squashfs_dir_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 start_block; __le32 nlink; __le16 file_size; @@ -384,7 +384,7 @@ struct squashfs_ldir_inode { __le16 uid; __le16 guid; __le32 mtime; - __le32 inode_number; + __le32 inode_number; __le32 nlink; __le32 file_size; __le32 start_block; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index c7655e8b31cd..652b8541f9c6 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -18,7 +18,7 @@ * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* - * xattr_id.c + * xattr.c */ #include <linux/init.h> @@ -295,7 +295,7 @@ static const struct xattr_handler squashfs_xattr_security_handler = { .get = squashfs_security_get }; -static inline const struct xattr_handler *squashfs_xattr_handler(int type) +static const struct xattr_handler *squashfs_xattr_handler(int type) { if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL)) /* ignore unrecognised type */ diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index 9da071ae181c..49fe0d719fbf 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -21,7 +21,7 @@ * xattr.h */ -#ifdef CONFIG_SQUASHFS_XATTRS +#ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, u64 *, int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, |