From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 13 Jun 2011 20:02:58 +0200 Subject: btrfs: add mirror_num to extent_read_full_page Currently, extent_read_full_page always assumes we are trying to read mirror 0, which generally is the best we can do. To add flexibility, pass it as a parameter. This will be needed by scrub fixup code. Signed-off-by: Jan Schmidt --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..dc0343802535 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -908,7 +908,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) -- cgit v1.2.3 From bb82ab88dfdb12948af58989c75bfe904bc1b09d Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 14:06:53 +0200 Subject: btrfs: add an extra wait mode to read_extent_buffer_pages read_extent_buffer_pages currently has two modes, either trigger a read without waiting for anything, or wait for the I/O to finish. The former also bails when it's unable to lock the page. This patch now adds an additional parameter to allow it to block on page lock, but don't wait for completion. Changes v5: - merge the 2 wait parameters into one and define WAIT_NONE, WAIT_COMPLETE and WAIT_PAGE_LOCK Change v6: - fix bug introduced in v5 Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 5 +++-- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/extent_io.h | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..e6170e142cad 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -367,7 +367,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -974,7 +975,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..823028e73cf8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3349,8 +3349,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,7 +3385,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3430,7 +3429,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..6d74c6b34691 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -248,6 +248,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); -- cgit v1.2.3 From ab0fff03055d2d1b01a7581badeba18db9c4f55c Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:25:41 +0200 Subject: btrfs: add READAHEAD extent buffer flag Add a READAHEAD extent buffer flag. Add a function to trigger a read with this flag set. Changes v2: - use extent buffer flags instead of extent state flags Changes v5: - adapt to changed read_extent_buffer_pages interface - don't return eb from reada_tree_block_flagged if it has CORRUPT flag set Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.h | 2 ++ fs/btrfs/extent_io.h | 1 + 3 files changed, 35 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6170e142cad..1220d04072c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,6 +980,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..b3bdb5c1390f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d74c6b34691..fcaf49bcb880 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -- cgit v1.2.3 From 90519d66abbccc251d14719ac76f191f70826e40 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 23 May 2011 14:30:00 +0200 Subject: btrfs: state information for readahead Add state information for readahead to btrfs_fs_info and btrfs_device Changes v2: - don't wait in radix_trees - add own set of workers for readahead Reviewed-by: Josef Bacik Signed-off-by: Arne Jansen --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 10 ++++++++++ fs/btrfs/volumes.c | 8 ++++++++ fs/btrfs/volumes.h | 8 ++++++++ 4 files changed, 31 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..f71fd24cc152 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1036,6 +1036,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; struct btrfs_workers caching_workers; + struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1120,10 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1220d04072c8..fd28c40e9ec3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1711,6 +1711,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); + fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1903,6 +1907,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1913,6 +1920,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); btrfs_start_workers(&fs_info->generic_worker, 1); @@ -1926,6 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); btrfs_start_workers(&fs_info->caching_workers, 1); + btrfs_start_workers(&fs_info->readahead_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2650,6 +2659,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..1dccce5bc93d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..2a751246188a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,14 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; }; struct btrfs_fs_devices { -- cgit v1.2.3 From 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 13:55:54 +0200 Subject: btrfs: hooks for readahead This adds the hooks needed for readahead. In the readpage_end_io_hook, the extent state is checked for the EXTENT_READAHEAD flag. Only in this case the readahead hook is called, to keep the impact on non-ra as low as possible. Additionally, a hook for a failed IO is added, otherwise readahead would wait indefinitely for the extent to finish. Changes for v2: - eliminate race condition Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.c | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fd28c40e9ec3..2151828aa142 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -609,11 +609,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -3166,6 +3202,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 823028e73cf8..deba714236d7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1731,7 +1731,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + start, end, state); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); -- cgit v1.2.3 From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 10:25:38 -0400 Subject: Btrfs: kill the durable block rsv stuff This is confusing code and isn't used by anything anymore, so delete it. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 ------ fs/btrfs/disk-io.c | 2 - fs/btrfs/extent-tree.c | 100 +++++++++---------------------------------------- fs/btrfs/inode.c | 4 -- fs/btrfs/relocation.c | 1 - 5 files changed, 17 insertions(+), 101 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ef777e85c89..c5ceba4078cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -772,13 +772,10 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; atomic_t usage; unsigned int priority:8; - unsigned int durable:1; unsigned int refill_used:1; unsigned int full:1; }; @@ -840,7 +837,6 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -919,11 +915,6 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..0b5643a68d57 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4add1ac2dda0..30c0558eae84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) spin_lock_init(&rsv->lock); atomic_set(&rsv->usage, 1); rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root, { if (rsv && atomic_dec_and_test(&rsv->usage)) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); + kfree(rsv); } } -/* - * make the block_rsv struct be able to capture freed space. - * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); -} - int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, ret = 0; } else { num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; + commit_trans = 1; } spin_unlock(&block_rsv->lock); if (!ret) @@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, } if (commit_trans) { + struct btrfs_space_info *sinfo = block_rsv->space_info; + if (trans) return -EAGAIN; + + spin_lock(&sinfo->lock); + if (sinfo->bytes_pinned < num_bytes) { + spin_unlock(&sinfo->lock); + return -ENOSPC; + } + spin_unlock(&sinfo->lock); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_may_use += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } } out: /* @@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_may_use += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f5f8a577e69..6402a41b9023 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273d..545b04358249 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc) return ret; rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; -- cgit v1.2.3 From 300e4f8a56f263797568c95b71c949f9f02e4534 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Aug 2011 14:06:00 -0400 Subject: Btrfs: put the block group cache after we commit the super In moving some enospc stuff around I noticed that when we unmount we are often evicting the free space cache inodes before we do our last commit. This isn't bad, but it makes us constantly have to re-read the inodes back. So instead don't evict the cache until after we do our last commit, this will make things a little less crappy and makes a future enospc change work properly. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b5643a68d57..4965a0179b31 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2543,8 +2543,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2570,6 +2568,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 701ef5951e3f..1ea10731797a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -105,7 +105,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!btrfs_fs_closing(root->fs_info)) { + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } -- cgit v1.2.3 From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 17:12:22 -0400 Subject: Btrfs: allow us to overcommit our enospc reservations One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under reserve. However if there is plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is setup for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test [torrent] filename=torrent-test rw=randwrite size=4g ioengine=sync directory=/mnt/btrfs-test go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/volumes.c | 39 ++++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47dea7118e0e..1eafccb162ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -893,6 +893,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4965a0179b31..51372a521167 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); @@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation + * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, int flush, int check) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; @@ -3459,9 +3460,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3470,9 +3471,8 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= orig_bytes) { + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { @@ -3489,10 +3489,43 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret && !check) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + orig_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..e138af710de2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; @@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2275,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } -- cgit v1.2.3 From 01d658f2ca3c85c1ffb20b306e30d16197000ce7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 1 Nov 2011 10:08:06 -0400 Subject: Btrfs: make sure to flush queued bios if write_cache_pages waits write_cache_pages tries to build up a large bio to stuff down the pipe. But if it needs to wait for a page lock, it needs to make sure and send down any pending writes so we don't deadlock with anyone who has the page lock and is waiting for writeback of things inside the bio. Dave Sterba triggered this as a deadlock between the autodefrag code and the extent write_cache_pages Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- fs/btrfs/disk-io.h | 2 -- fs/btrfs/extent_io.c | 14 ++++++++++---- fs/btrfs/extent_io.h | 3 ++- 4 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51372a521167..6f58911ece0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2735,7 +2735,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +2753,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +2771,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..e678539c8519 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -83,8 +83,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f284d4e5f447..b40ba75f4483 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2613,10 +2613,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 325a346369da..cbd4824a7c94 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -86,7 +86,8 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { -- cgit v1.2.3 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info has now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of it's dynamically allocated members. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/ctree.h | 16 ++++++++++++++-- fs/btrfs/disk-io.c | 33 ++++++++++----------------------- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/file-item.c | 17 ++++++----------- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 19 +++++++++++++------ fs/btrfs/transaction.c | 10 +++++----- fs/btrfs/tree-log.c | 4 ++-- fs/btrfs/volumes.c | 24 ++++++++++++------------ 12 files changed, 78 insertions(+), 76 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f1734..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f63c9b3f6e08..5181c53c1124 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -936,8 +936,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f58911ece0d..761717c98278 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2152,7 +2151,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2164,12 +2162,7 @@ fail_bdi: fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + free_fs_info(fs_info); return ERR_PTR(err); } @@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); @@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb7626646bba..782eb3ea8edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; @@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821becd..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8586e10953c..b6b612e14ed7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33aae13cc74b..8f6e14279409 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; @@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..69a600f07763 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5429b1fa0bfc..f7e9de724ef2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *compress_type; bool compress_force = false; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); @@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; tree_root->fs_info = fs_info; + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_close_devices; + } + bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) { @@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); return ERR_PTR(error); } @@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29bef63e23ba..373c7ec1a026 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ca1b6b83bd1..f4d81c06d48f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6938b45e0fd..c3b45564048e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root, int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; -- cgit v1.2.3 From af31f5e5b84b5bf2bcec464153a5130b170b2770 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:17:42 -0400 Subject: Btrfs: add a log of past tree roots This takes some of the free space in the btrfs super block to record information about most of the roots in the last four commits. It also adds a -o recovery to use the root history log when we're not able to read the tree of tree roots, the extent tree root, the device tree root or the csum root. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 95 +++++++++++++++++ fs/btrfs/disk-io.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/super.c | 7 +- 3 files changed, 369 insertions(+), 24 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5181c53c1124..78f43d1102a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -360,6 +360,47 @@ struct btrfs_header { #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 #define BTRFS_LABEL_SIZE 256 +/* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -406,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1155,9 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1357,6 +1402,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1972,6 +2018,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 761717c98278..a61f8a6cf219 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1134,10 +1134,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1576,6 +1578,228 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. + */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -1603,6 +1827,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, int ret; int err = -EINVAL; + int num_backups_tried = 0; + int backup_index = 0; struct btrfs_super_block *disk_super; @@ -1781,6 +2007,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. @@ -1938,7 +2171,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1953,11 +2186,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1965,32 +2199,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2123,20 +2358,10 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2164,6 +2389,25 @@ fail_srcu: fail: free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2333,6 +2577,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7e9de724ef2..57080dffdfc6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -198,6 +198,7 @@ static match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "no_space_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -392,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); -- cgit v1.2.3 From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion I've been hitting warnings in use_block_rsv when running the delayed insertion stuff. It's because we will readjust global block rsv based on what is in use, which means we could end up discarding reservations that are for the delayed insertion stuff. So instead create a seperate block rsv for the delayed insertion stuff. This will also make it easier to debug problems with the delayed insertion reservations since we will know that only the delayed insertion code touches this block_rsv. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/delayed-inode.c | 14 +++++++------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 78f43d1102a0..3002e5d4da0b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -957,6 +957,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..fc4026af7290 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata( return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a61f8a6cf219..23b6776477b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 782eb3ea8edf..01c1f08b976a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From c674e04e1cd6049715e7b9446790f4b441e547c0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 22:23:13 -0400 Subject: Btrfs: fix extent_buffer leak in the metadata IO error handling The scrub readahead branch brought in a new error handling hook, but it was leaking extent_buffer references. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0eb1f0951251..40a62b980087 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -644,6 +644,7 @@ static int btree_io_failed_hook(struct bio *failed_bio, clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); btree_readahead_hook(root, eb, eb->start, -EIO); } + free_extent_buffer(eb); out: return -EIO; /* we fixed nothing */ -- cgit v1.2.3 From 306c8b68c82dfe6b7c9e5b61985760ad5d089205 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 3 Nov 2011 15:21:39 -0400 Subject: Btrfs: stop the readahead threads on failed mount If we don't stop them, they linger around corrupting memory by using pointers to freed things. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 40a62b980087..e532892431f4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2444,6 +2444,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); -- cgit v1.2.3 From 7c7e82a77fe3d89ae50824aa7c897454675eb4c4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sun, 6 Nov 2011 18:50:56 -0500 Subject: Btrfs: check for a null fs root when writing to the backup root log During log replay, can commit the transaction before the fs_root pointers are setup, so we have to make sure they are not null before trying to use them. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e532892431f4..e53a5bb85670 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1754,11 +1754,18 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(info->extent_root->node)); - btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); - btrfs_set_backup_fs_root_gen(root_backup, + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. + */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); - btrfs_set_backup_fs_root_level(root_backup, + btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); + } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, -- cgit v1.2.3 From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 00:08:15 +0200 Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree() Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any of the tree roots (first 'goto fail' in open_ctree()) we would dereference a NULL fs_info pointer in free_fs_info(). Secondly, after failures from init_srcu_struct(), setup_bdi() and new_inode() we would leak all earlier allocated roots: fs_info fields haven't been initialized yet so free_fs_info() is rendered useless. Fix this by initializing fs_info pointer and fs_info fields before any allocations happen. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e53a5bb85670..91db90b526c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); -- cgit v1.2.3 From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 13:26:37 +0200 Subject: Btrfs: close devices on all error paths in open_ctree() Fix a bug introduced by 7e662854 where we would leave devices busy on certain error paths in open_ctree(). fs_info is guaranteed to be non-NULL now so it's safe to dereference it on all error paths. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91db90b526c2..b6a5c0dd0dd8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2460,21 +2460,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; -- cgit v1.2.3 From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 18 Nov 2011 15:07:51 -0500 Subject: Btrfs: fix barrier flushes When btrfs is writing the super blocks, it send barrier flushes to make sure writeback caching drives get all the metadata on disk in the right order. But, we have two bugs in the way these are sent down. When doing full commits (not via the tree log), we are sending the barrier down before the last super when it should be going down before the first. In multi-device setups, we should be waiting for the barriers to complete on all devices before writing any of the supers. Both of these bugs can cause corruptions on power failures. We fix it with some new code to send down empty barriers to all devices before writing the first super. Alexandre Oliva found the multi-device bug. Arne Jansen did the async barrier loop. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva --- fs/btrfs/disk-io.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.h | 6 +++ 2 files changed, 134 insertions(+), 17 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6a5c0dd0dd8..48d30138237f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f352..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { -- cgit v1.2.3 From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: mirror_num should be int, not u64 My previous patch introduced some u64 for failed_mirror variables, this one makes it consistent again. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48d30138237f..94abc25392f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47fdba7853c3..a877b8b212eb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (unsigned long)bio->bi_bdev; + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23bc..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, -- cgit v1.2.3