diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/afs/write.c | 1 | ||||
-rw-r--r-- | fs/aio.c | 52 | ||||
-rw-r--r-- | fs/block_dev.c | 19 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 3 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 9 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 138 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 126 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 7 | ||||
-rw-r--r-- | fs/btrfs/lzo.c | 21 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 13 | ||||
-rw-r--r-- | fs/btrfs/super.c | 7 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 13 | ||||
-rw-r--r-- | fs/eventpoll.c | 95 | ||||
-rw-r--r-- | fs/fuse/dir.c | 7 | ||||
-rw-r--r-- | fs/fuse/file.c | 52 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 6 | ||||
-rw-r--r-- | fs/inode.c | 9 | ||||
-rw-r--r-- | fs/internal.h | 2 | ||||
-rw-r--r-- | fs/namespace.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 6 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.c | 7 | ||||
-rw-r--r-- | fs/ocfs2/super.c | 28 | ||||
-rw-r--r-- | fs/partitions/ldm.c | 5 |
24 files changed, 530 insertions, 100 deletions
diff --git a/fs/afs/write.c b/fs/afs/write.c index 15690bb1d3b5..789b3afb3423 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->first = candidate->last = index; candidate->offset_first = from; candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); @@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -#define get_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - atomic_inc(&(kioctx)->users); \ -} while (0) -#define put_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ - __put_ioctx(kioctx); \ -} while (0) +static inline void get_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + atomic_inc(&kioctx->users); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. @@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id && !ctx->dead) { - get_ioctx(ctx); + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ ret = ctx; break; } @@ -1629,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; spin_lock_irq(&ctx->ctx_lock); + /* + * We could have raced with io_destroy() and are currently holding a + * reference to ctx which should be destroyed. We cannot submit IO + * since ctx gets freed as soon as io_submit() puts its reference. The + * check here is reliable: io_destroy() sets ctx->dead before waiting + * for outstanding IO and the barrier between these two is realized by + * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we + * increment ctx->reqs_active before checking for ctx->dead and the + * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we + * don't see ctx->dead set here, io_destroy() waits for our IO to + * finish. + */ + if (ctx->dead) { + spin_unlock_irq(&ctx->ctx_lock); + ret = -EINVAL; + goto out_put_req; + } aio_run_iocb(req); if (!list_empty(&ctx->run_list)) { /* drain the run list */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 4fb8a3431531..889287019599 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } @@ -922,14 +928,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * flush_disk - invalidates all buffer-cache entries on a disk * * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes * * Invalidates all buffer-cache entries on a disk. It should be called * when a disk has been changed -- either by a media change or online * resize. */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty) { - if (__invalidate_device(bdev)) { + if (__invalidate_device(bdev, kill_dirty)) { char name[BDEVNAME_SIZE] = ""; if (bdev->bd_disk) @@ -966,7 +973,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1019,7 +1026,7 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev); + flush_disk(bdev, true); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1600,7 +1607,7 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; @@ -1613,7 +1620,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c98b3af6052..6f820fa23df4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1254,6 +1254,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2218,6 +2219,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f3c96fc01439..588ff9849873 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5376,7 +5376,7 @@ again: num_bytes, data, 1); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -8065,6 +8065,13 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +} + /* * helper to account the unused space of all the readonly block group in the * list. takes mirrors into account. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 92ac5192c518..fd3f172e94e6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -2912,6 +2918,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; + off = extent_map_end(em); if (off >= max) end = 1; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } - em_start = em->start; em_len = em->len; - + em_end = extent_map_end(em); + emflags = em->flags; disko = 0; flags = 0; @@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7083cfafd061..9318dfefd59c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -191,7 +191,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb9bd7832b6d..0efdb65953c5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private = 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -5280,6 +5280,128 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -6102,7 +6224,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be2d4f6aaa5e..5fdb2abc4fa7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) return -EINVAL; if (flags & ~BTRFS_SUBVOL_RDONLY) return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } - ret = btrfs_update_root(trans, root, + ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); btrfs_commit_transaction(trans, root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cc9b450399df..a178f5ebea78 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws, unsigned long tot_out; unsigned long tot_len; char *buf; + bool may_late_unmap, need_unmap; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); @@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in += in_len; working_bytes = in_len; + may_late_unmap = need_unmap = false; /* fast path: avoid using the working buffer */ if (in_page_bytes_left >= in_len) { buf = data_in + in_offset; bytes = in_len; + may_late_unmap = true; goto cont; } @@ -329,14 +332,17 @@ cont: if (working_bytes == 0 && tot_in >= tot_len) break; - kunmap(pages_in[page_in_index]); - page_in_index++; - if (page_in_index >= total_pages_in) { + if (page_in_index + 1 >= total_pages_in) { ret = -1; - data_in = NULL; goto done; } - data_in = kmap(pages_in[page_in_index]); + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); in_page_bytes_left = PAGE_CACHE_SIZE; in_offset = 0; @@ -346,6 +352,8 @@ cont: out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "btrfs decompress failed\n"); ret = -1; @@ -363,8 +371,7 @@ cont: break; } done: - if (data_in) - kunmap(pages_in[page_in_index]); + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0825e4ed9447..31ade5802ae8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); BUG_ON(IS_ERR(trans)); - +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a004008f7d28..d39a9895d932 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -155,7 +155,8 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_err, }; static match_table_t tokens = { @@ -184,6 +185,7 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, {Opt_err, NULL}, }; @@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index af7dbca15276..dd13eb81ee40 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1338,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1416,6 +1416,13 @@ out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* @@ -1633,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 267d0ada4541..4a09af9e9a63 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -63,6 +63,13 @@ * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -224,6 +231,9 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + /* Used for safe wake up implementation */ static struct nested_calls poll_safewake_ncalls; @@ -1198,6 +1208,62 @@ retry: return res; } +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + epi->ffd.file->private_data, current); + if (error != 0) + break; + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); +} + /* * Open an eventpoll file descriptor. */ @@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; + int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; @@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ ep = file->private_data; + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. + * + * We hold epmutex across the loop check and the insert in this case, in + * order to prevent two separate inserts from racing and each doing the + * insert "at the same time" such that ep_loop_check passes on both + * before either one does the insert, thereby creating a cycle. + */ + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } + + mutex_lock(&ep->mtx); /* @@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: + if (unlikely(did_lock_epmutex)) + mutex_unlock(&epmutex); + fput(tfile); error_fput: fput(file); @@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void) EP_ITEM_COST; BUG_ON(max_user_watches < 0); + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bfed8447ed80..83543b5ff941 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) return err; - if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) - return 0; + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } if (attr->ia_valid & ATTR_SIZE) is_truncate = true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 95da1bc1c826..9e0832dbb1e3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } kfree(ff); } } @@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode) * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. */ - fuse_file_put(ff); + fuse_file_put(ff, ff->fc->destroy_req != NULL); } static int fuse_open(struct inode *inode, struct file *file) @@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) page_cache_release(page); } if (req->ff) - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ae5744a2f9e9..d4286947bc2c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/poll.h> +#include <linux/workqueue.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -262,7 +263,10 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - struct fuse_release_in in; + union { + struct fuse_release_in in; + struct work_struct work; + }; struct path path; } release; struct fuse_init_in init_in; diff --git a/fs/inode.c b/fs/inode.c index 9c2b795ccc93..0647d80accf6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -548,11 +548,14 @@ void evict_inodes(struct super_block *sb) /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes * * Attempts to free all inodes for a given superblock. If there were any * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. */ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes(struct super_block *sb, bool kill_dirty) { int busy = 0; struct inode *inode, *next; @@ -564,6 +567,10 @@ int invalidate_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; + if (inode->i_state & I_DIRTY && !kill_dirty) { + busy = 1; + continue; + } if (atomic_read(&inode->i_count)) { busy = 1; continue; diff --git a/fs/internal.h b/fs/internal.h index 0663568b1247..9b976b57d7fe 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *); */ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/namespace.c b/fs/namespace.c index 7b0b95371696..d1edf26025dc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1244,7 +1244,7 @@ static int do_umount(struct vfsmount *mnt, int flags) */ br_write_lock(vfsmount_lock); if (mnt_get_count(mnt) != 2) { - br_write_lock(vfsmount_lock); + br_write_unlock(vfsmount_lock); return -EBUSY; } br_write_unlock(vfsmount_lock); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 43e56b97f9c0..6180da1e37e6 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) ocfs2_quota_trans_credits(sb); } -/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) dx_root update for free list */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) static inline int ocfs2_add_dir_index_credits(struct super_block *sb) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b5f9160e93e9..19ebc5aad391 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; - u32 new_bit, new_len; + u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; @@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, goto out; } + orig_num_clusters = num_clusters; + while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, @@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + ret = ocfs2_cow_sync_writeback(sb, context, cpos, + orig_num_clusters); if (ret) mlog_errno(ret); } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 38f986d2447e..36c423fb0635 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb, struct mount_options *mopt, int is_remount) { - int status; + int status, user_stack = 0; char *p; u32 tmp; @@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb, memcpy(mopt->cluster_stack, args[0].from, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; break; case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; @@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb, } } - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } } status = 1; diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 789c625c7aa5..b10e3540d5b7 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) } vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); |