From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 20:02:58 +0200
Subject: btrfs: add mirror_num to extent_read_full_page

Currently, extent_read_full_page always assumes we are trying to read mirror
0, which generally is the best we can do. To add flexibility, pass it as a
parameter. This will be needed by scrub fixup code.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..dc0343802535 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -908,7 +908,7 @@ static int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btree_get_extent);
+	return extent_read_full_page(tree, page, btree_get_extent, 0);
 }
 
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
-- 
cgit v1.2.3


From bb82ab88dfdb12948af58989c75bfe904bc1b09d Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Fri, 10 Jun 2011 14:06:53 +0200
Subject: btrfs: add an extra wait mode to read_extent_buffer_pages

read_extent_buffer_pages currently has two modes, either trigger a read
without waiting for anything, or wait for the I/O to finish. The former
also bails when it's unable to lock the page. This patch now adds an
additional parameter to allow it to block on page lock, but don't wait
for completion.

Changes v5:
 - merge the 2 wait parameters into one and define WAIT_NONE, WAIT_COMPLETE and
   WAIT_PAGE_LOCK

Change v6:
 - fix bug introduced in v5

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 5 +++--
 fs/btrfs/extent_io.c | 7 +++----
 fs/btrfs/extent_io.h | 3 +++
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..e6170e142cad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -367,7 +367,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+		ret = read_extent_buffer_pages(io_tree, eb, start,
+					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
@@ -974,7 +975,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent, 0);
+				 buf, 0, WAIT_NONE, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..823028e73cf8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3349,8 +3349,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb,
-			     u64 start, int wait,
+			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
@@ -3386,7 +3385,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!wait) {
+		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
 				goto unlock_exit;
 		} else {
@@ -3430,7 +3429,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait)
+	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
 	for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..6d74c6b34691 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -248,6 +248,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					 u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
+#define WAIT_NONE	0
+#define WAIT_COMPLETE	1
+#define WAIT_PAGE_LOCK	2
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
-- 
cgit v1.2.3


From ab0fff03055d2d1b01a7581badeba18db9c4f55c Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 23 May 2011 14:25:41 +0200
Subject: btrfs: add READAHEAD extent buffer flag

Add a READAHEAD extent buffer flag.
Add a function to trigger a read with this flag set.

Changes v2:
 - use extent buffer flags instead of extent state flags

Changes v5:
 - adapt to changed read_extent_buffer_pages interface
 - don't return eb from reada_tree_block_flagged if it has CORRUPT flag set

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/disk-io.h   |  2 ++
 fs/btrfs/extent_io.h |  1 +
 3 files changed, 35 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e6170e142cad..1220d04072c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -980,6 +980,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	return ret;
 }
 
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+				       btree_get_extent, mirror_num);
+	if (ret) {
+		free_extent_buffer(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..b3bdb5c1390f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid);
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 			 u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d74c6b34691..fcaf49bcb880 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-- 
cgit v1.2.3


From 90519d66abbccc251d14719ac76f191f70826e40 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 23 May 2011 14:30:00 +0200
Subject: btrfs: state information for readahead

Add state information for readahead to btrfs_fs_info and btrfs_device

Changes v2:
 - don't wait in radix_trees
 - add own set of workers for readahead

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.h   |  5 +++++
 fs/btrfs/disk-io.c | 10 ++++++++++
 fs/btrfs/volumes.c |  8 ++++++++
 fs/btrfs/volumes.h |  8 ++++++++
 4 files changed, 31 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..f71fd24cc152 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1036,6 +1036,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1120,10 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1220d04072c8..fd28c40e9ec3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1711,6 +1711,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
 
+	/* readahead state */
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	spin_lock_init(&fs_info->reada_lock);
+
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
 
@@ -1903,6 +1907,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1913,6 +1920,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
+	fs_info->readahead_workers.idle_thresh = 2;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1926,6 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 	btrfs_start_workers(&fs_info->delayed_workers, 1);
 	btrfs_start_workers(&fs_info->caching_workers, 1);
+	btrfs_start_workers(&fs_info->readahead_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2650,6 +2659,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->submit_workers);
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_workers(&fs_info->readahead_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..1dccce5bc93d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
 		}
 		INIT_LIST_HEAD(&device->dev_alloc_list);
 
+		/* init readahead state */
+		spin_lock_init(&device->reada_lock);
+		device->reada_curr_zone = NULL;
+		atomic_set(&device->reada_in_flight, 0);
+		device->reada_next = 0;
+		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add_rcu(&device->dev_list, &fs_devices->devices);
 		mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..2a751246188a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
 	struct btrfs_work work;
 	struct rcu_head rcu;
 	struct work_struct rcu_work;
+
+	/* readahead state */
+	spinlock_t reada_lock;
+	atomic_t reada_in_flight;
+	u64 reada_next;
+	struct reada_zone *reada_curr_zone;
+	struct radix_tree_root reada_zones;
+	struct radix_tree_root reada_extents;
 };
 
 struct btrfs_fs_devices {
-- 
cgit v1.2.3


From 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Fri, 10 Jun 2011 13:55:54 +0200
Subject: btrfs: hooks for readahead

This adds the hooks needed for readahead. In the readpage_end_io_hook,
the extent state is checked for the EXTENT_READAHEAD flag. Only in this
case the readahead hook is called, to keep the impact on non-ra as low
as possible.
Additionally, a hook for a failed IO is added, otherwise readahead would
wait indefinitely for the extent to finish.

Changes for v2:
 - eliminate race condition

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 37 +++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.c |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fd28c40e9ec3..2151828aa142 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -609,11 +609,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
 err:
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, ret);
+	}
+
 	free_extent_buffer(eb);
 out:
 	return ret;
 }
 
+static int btree_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 struct extent_state *state)
+{
+	struct extent_io_tree *tree;
+	unsigned long len;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+
+	len = page->private >> 2;
+	WARN_ON(len == 0);
+
+	eb = alloc_extent_buffer(tree, start, len, page);
+	if (eb == NULL)
+		goto out;
+
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, -EIO);
+	}
+
+out:
+	return -EIO;	/* we fixed nothing */
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
@@ -3166,6 +3202,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
+	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
 	/* note we're sharing with inode.c for the merge bio hook */
 	.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 823028e73cf8..deba714236d7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1731,7 +1731,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		if (!uptodate && tree->ops &&
 		    tree->ops->readpage_io_failed_hook) {
 			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+							 start, end, state);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
-- 
cgit v1.2.3


From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 5 Aug 2011 10:25:38 -0400
Subject: Btrfs: kill the durable block rsv stuff

This is confusing code and isn't used by anything anymore, so delete it.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  11 ------
 fs/btrfs/disk-io.c     |   2 -
 fs/btrfs/extent-tree.c | 100 +++++++++----------------------------------------
 fs/btrfs/inode.c       |   4 --
 fs/btrfs/relocation.c  |   1 -
 5 files changed, 17 insertions(+), 101 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ef777e85c89..c5ceba4078cc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -772,13 +772,10 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
 	atomic_t usage;
 	unsigned int priority:8;
-	unsigned int durable:1;
 	unsigned int refill_used:1;
 	unsigned int full:1;
 };
@@ -840,7 +837,6 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -919,11 +915,6 @@ struct btrfs_fs_info {
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..0b5643a68d57 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
-	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4add1ac2dda0..30c0558eae84 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
-		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
 	spin_lock_init(&rsv->lock);
 	atomic_set(&rsv->usage, 1);
 	rsv->priority = 6;
-	INIT_LIST_HEAD(&rsv->list);
 }
 
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 {
 	if (rsv && atomic_dec_and_test(&rsv->usage)) {
 		btrfs_block_rsv_release(root, rsv, (u64)-1);
-		if (!rsv->durable)
-			kfree(rsv);
+		kfree(rsv);
 	}
 }
 
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv)
-{
-	block_rsv->durable = 1;
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
-}
-
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
@@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		ret = 0;
 	} else {
 		num_bytes -= block_rsv->reserved;
-		if (block_rsv->durable &&
-		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
-			commit_trans = 1;
+		commit_trans = 1;
 	}
 	spin_unlock(&block_rsv->lock);
 	if (!ret)
@@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	}
 
 	if (commit_trans) {
+		struct btrfs_space_info *sinfo = block_rsv->space_info;
+
 		if (trans)
 			return -EAGAIN;
+
+		spin_lock(&sinfo->lock);
+		if (sinfo->bytes_pinned < num_bytes) {
+			spin_unlock(&sinfo->lock);
+			return -ENOSPC;
+		}
+		spin_unlock(&sinfo->lock);
+
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
@@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
 	update_global_block_rsv(fs_info);
 }
 
@@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		cache->space_info->bytes_pinned -= len;
-		if (cache->ro) {
+		if (cache->ro)
 			cache->space_info->bytes_readonly += len;
-		} else if (cache->reserved_pinned > 0) {
-			len = min(len, cache->reserved_pinned);
-			cache->reserved_pinned -= len;
-			cache->space_info->bytes_may_use += len;
-		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
 	}
@@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_io_tree *unpin;
-	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *next_rsv;
 	u64 start;
 	u64 end;
-	int idx;
 	int ret;
 
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_for_each_entry_safe(block_rsv, next_rsv,
-				 &fs_info->durable_block_rsv_list, list) {
-
-		idx = trans->transid & 0x1;
-		if (block_rsv->freed[idx] > 0) {
-			block_rsv_add_bytes(block_rsv,
-					    block_rsv->freed[idx], 0);
-			block_rsv->freed[idx] = 0;
-		}
-		if (atomic_read(&block_rsv->usage) == 0) {
-			btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-
-			if (block_rsv->freed[0] == 0 &&
-			    block_rsv->freed[1] == 0) {
-				list_del_init(&block_rsv->list);
-				kfree(block_rsv);
-			}
-		} else {
-			btrfs_block_rsv_release(root, block_rsv, 0);
-		}
-	}
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
-
 	return 0;
 }
 
@@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			ret = check_ref_cleanup(trans, root, buf->start);
 			if (!ret)
-				goto pin;
+				goto out;
 		}
 
 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			pin_down_extent(root, cache, buf->start, buf->len, 1);
-			goto pin;
+			goto out;
 		}
 
 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
-
-		goto out;
-	}
-pin:
-	if (block_rsv->durable && !cache->ro) {
-		ret = 0;
-		spin_lock(&cache->lock);
-		if (!cache->ro) {
-			cache->reserved_pinned += buf->len;
-			ret = 1;
-		}
-		spin_unlock(&cache->lock);
-
-		if (ret) {
-			spin_lock(&block_rsv->lock);
-			block_rsv->freed[trans->transid & 0x1] += buf->len;
-			spin_unlock(&block_rsv->lock);
-		}
 	}
 out:
 	/*
@@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
-	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes + min_allocable_bytes <=
-	    sinfo->total_bytes) {
+	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
-		sinfo->bytes_may_use += cache->reserved_pinned;
-		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f5f8a577e69..6402a41b9023 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	}
 	spin_unlock(&root->orphan_lock);
 
-	if (block_rsv)
-		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-
 	/* grab metadata reservation from transaction handle */
 	if (reserve) {
 		ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode)
 	rsv = btrfs_alloc_block_rsv(root);
 	if (!rsv)
 		return -ENOMEM;
-	btrfs_add_durable_block_rsv(root->fs_info, rsv);
 
 	trans = btrfs_start_transaction(root, 4);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..545b04358249 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 		return ret;
 
 	rc->block_rsv->refill_used = 1;
-	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
 
 	memset(&rc->cluster, 0, sizeof(rc->cluster));
 	rc->search_start = rc->block_group->key.objectid;
-- 
cgit v1.2.3


From 300e4f8a56f263797568c95b71c949f9f02e4534 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 29 Aug 2011 14:06:00 -0400
Subject: Btrfs: put the block group cache after we commit the super

In moving some enospc stuff around I noticed that when we unmount we are often
evicting the free space cache inodes before we do our last commit.  This isn't
bad, but it makes us constantly have to re-read the inodes back.  So instead
don't evict the cache until after we do our last commit, this will make things a
little less crappy and makes a future enospc change work properly.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/disk-io.c          | 4 ++--
 fs/btrfs/free-space-cache.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0b5643a68d57..4965a0179b31 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2543,8 +2543,6 @@ int close_ctree(struct btrfs_root *root)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_run_defrag_inodes(root->fs_info);
 
-	btrfs_put_block_group_cache(fs_info);
-
 	/*
 	 * Here come 2 situations when btrfs is broken to flip readonly:
 	 *
@@ -2570,6 +2568,8 @@ int close_ctree(struct btrfs_root *root)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	btrfs_put_block_group_cache(fs_info);
+
 	kthread_stop(root->fs_info->transaction_kthread);
 	kthread_stop(root->fs_info->cleaner_kthread);
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 701ef5951e3f..1ea10731797a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -105,7 +105,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
 	}
 
-	if (!btrfs_fs_closing(root->fs_info)) {
+	if (!block_group->iref) {
 		block_group->inode = igrab(inode);
 		block_group->iref = 1;
 	}
-- 
cgit v1.2.3


From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 17:12:22 -0400
Subject: Btrfs: allow us to overcommit our enospc reservations

One of the things that kills us is the fact that our ENOSPC reservations are
horribly over the top in most normal cases.  There isn't too much that can be
done about this because when we are completely full we really need them to work
like this so we don't under reserve.  However if there is plenty of unallocated
chunks on the disk we can use that to gauge how much we can overcommit.  So this
patch adds chunk free space accounting so we always know how much unallocated
space we have.  Then if we fail to make a reservation within our allocated
space, check to see if we can overcommit.  In the normal flushing case (like
with delalloc metadata reservations) we'll take the free space and divide it by
2 if our metadata profile is setup for DUP or any of those, and then divide it
by 8 to make sure we don't overcommit too much.  Then if we're in a non-flushing
case (we really need this reservation now!) we only limit ourselves to half of
the free space.  This makes this fio test

[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test

go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB
file system.  This doesn't seem to break my other enospc tests, but could really
use some more testing as this is a super scary change.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/volumes.c     | 39 ++++++++++++++++++++++++++++----
 4 files changed, 88 insertions(+), 18 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 47dea7118e0e..1eafccb162ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -893,6 +893,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4965a0179b31..51372a521167 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
+	fs_info->free_chunk_space = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fd65f6bc676c..25b69d0f9135 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
  * @flush - wether or not we can flush to make our reservation
+ * @check - wether this is just to check if we have enough space or not
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes, int flush)
+				  u64 orig_bytes, int flush, int check)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	struct btrfs_trans_handle *trans;
-	u64 unused;
+	u64 used;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
@@ -3459,9 +3460,9 @@ again:
 	}
 
 	ret = -ENOSPC;
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
 
 	/*
 	 * The idea here is that we've not already over-reserved the block group
@@ -3470,9 +3471,8 @@ again:
 	 * lets start flushing stuff first and then come back and try to make
 	 * our reservation.
 	 */
-	if (unused <= space_info->total_bytes) {
-		unused = space_info->total_bytes - unused;
-		if (unused >= orig_bytes) {
+	if (used <= space_info->total_bytes) {
+		if (used + orig_bytes <= space_info->total_bytes) {
 			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
@@ -3489,10 +3489,43 @@ again:
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
-		num_bytes = unused - space_info->total_bytes +
+		num_bytes = used - space_info->total_bytes +
 			(orig_bytes * (retries + 1));
 	}
 
+	if (ret && !check) {
+		u64 profile = btrfs_get_alloc_profile(root, 0);
+		u64 avail;
+
+		spin_lock(&root->fs_info->free_chunk_lock);
+		avail = root->fs_info->free_chunk_space;
+
+		/*
+		 * If we have dup, raid1 or raid10 then only half of the free
+		 * space is actually useable.
+		 */
+		if (profile & (BTRFS_BLOCK_GROUP_DUP |
+			       BTRFS_BLOCK_GROUP_RAID1 |
+			       BTRFS_BLOCK_GROUP_RAID10))
+			avail >>= 1;
+
+		/*
+		 * If we aren't flushing don't let us overcommit too much, say
+		 * 1/8th of the space.  If we can flush, let it overcommit up to
+		 * 1/2 of the space.
+		 */
+		if (flush)
+			avail >>= 3;
+		else
+			avail >>= 1;
+		 spin_unlock(&root->fs_info->free_chunk_lock);
+
+		if (used + orig_bytes < space_info->total_bytes + avail) {
+			space_info->bytes_may_use += orig_bytes;
+			ret = 0;
+		}
+	}
+
 	/*
 	 * Couldn't make our reservation, save our place so while we're trying
 	 * to reclaim space we can actually use it instead of somebody else
@@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
@@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		return block_rsv;
 	if (ret) {
 		WARN_ON(1);
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
 		if (!ret) {
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..e138af710de2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(ret);
 
-	if (device->bytes_used > 0)
-		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+	if (device->bytes_used > 0) {
+		u64 len = btrfs_dev_extent_length(leaf, extent);
+		device->bytes_used -= len;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += len;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	ret = btrfs_del_item(trans, root, path);
 
 out:
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_undo;
 
+	spin_lock(&root->fs_info->free_chunk_lock);
+	root->fs_info->free_chunk_space = device->total_bytes -
+		device->bytes_used;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
 	device->in_fs_metadata = 0;
 	btrfs_scrub_cancel_dev(root, device);
 
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->num_can_discard++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
+	spin_lock(&root->fs_info->free_chunk_lock);
+	root->fs_info->free_chunk_space += device->total_bytes;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 		root->fs_info->fs_devices->rotating = 1;
 
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	lock_chunks(root);
 
 	device->total_bytes = new_size;
-	if (device->writeable)
+	if (device->writeable) {
 		device->fs_devices->total_rw_bytes -= diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space -= diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	unlock_chunks(root);
 
 again:
@@ -2257,6 +2275,9 @@ again:
 		device->total_bytes = old_size;
 		if (device->writeable)
 			device->fs_devices->total_rw_bytes += diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
 		unlock_chunks(root);
 		goto done;
 	}
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		index++;
 	}
 
+	spin_lock(&extent_root->fs_info->free_chunk_lock);
+	extent_root->fs_info->free_chunk_space -= (stripe_size *
+						   map->num_stripes);
+	spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
 	index = 0;
 	stripe = &chunk->stripe;
 	while (index < map->num_stripes) {
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
-	if (device->writeable)
+	if (device->writeable) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += device->total_bytes -
+			device->bytes_used;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	ret = 0;
 	return ret;
 }
-- 
cgit v1.2.3


From 01d658f2ca3c85c1ffb20b306e30d16197000ce7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 Nov 2011 10:08:06 -0400
Subject: Btrfs: make sure to flush queued bios if write_cache_pages waits

write_cache_pages tries to build up a large bio to stuff down the pipe.
But if it needs to wait for a page lock, it needs to make sure and send
down any pending writes so we don't deadlock with anyone who has the
page lock and is waiting for writeback of things inside the bio.

Dave Sterba triggered this as a deadlock between the autodefrag code and
the extent write_cache_pages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 13 ++++++++++---
 fs/btrfs/disk-io.h   |  2 --
 fs/btrfs/extent_io.c | 14 ++++++++++----
 fs/btrfs/extent_io.h |  3 ++-
 4 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51372a521167..6f58911ece0d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2735,7 +2735,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return ret;
 }
 
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+				void (*flush_fn)(void *))
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +2753,10 @@ int btree_lock_page_hook(struct page *page)
 	if (!eb)
 		goto out;
 
-	btrfs_tree_lock(eb);
+	if (!btrfs_try_tree_write_lock(eb)) {
+		flush_fn(data);
+		btrfs_tree_lock(eb);
+	}
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
 
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +2771,10 @@ int btree_lock_page_hook(struct page *page)
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
-	lock_page(page);
+	if (!trylock_page(page)) {
+		flush_fn(data);
+		lock_page(page);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..e678539c8519 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -83,8 +83,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f284d4e5f447..b40ba75f4483 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2613,10 +2613,16 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
-				tree->ops->write_cache_pages_lock_hook(page);
-			else
-				lock_page(page);
+			if (tree->ops &&
+			    tree->ops->write_cache_pages_lock_hook) {
+				tree->ops->write_cache_pages_lock_hook(page,
+							       data, flush_fn);
+			} else {
+				if (!trylock_page(page)) {
+					flush_fn(data);
+					lock_page(page);
+				}
+			}
 
 			if (unlikely(page->mapping != mapping)) {
 				unlock_page(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 325a346369da..cbd4824a7c94 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -86,7 +86,8 @@ struct extent_io_ops {
 				  struct extent_state *other);
 	void (*split_extent_hook)(struct inode *inode,
 				  struct extent_state *orig, u64 split);
-	int (*write_cache_pages_lock_hook)(struct page *page);
+	int (*write_cache_pages_lock_hook)(struct page *page, void *data,
+					   void (*flush_fn)(void *));
 };
 
 struct extent_io_tree {
-- 
cgit v1.2.3


From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 13 Apr 2011 15:41:04 +0200
Subject: btrfs: separate superblock items out of fs_info

fs_info has now ~9kb, more than fits into one page. This will cause
mount failure when memory is too fragmented. Top space consumers are
super block structures super_copy and super_for_commit, ~2.8kb each.
Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64)

Add a wrapper for freeing fs_info and all of it's dynamically allocated
members.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/ctree.h       | 16 ++++++++++++++--
 fs/btrfs/disk-io.c     | 33 ++++++++++-----------------------
 fs/btrfs/extent-tree.c | 16 ++++++++--------
 fs/btrfs/file-item.c   | 17 ++++++-----------
 fs/btrfs/inode.c       |  2 +-
 fs/btrfs/ioctl.c       |  8 ++++----
 fs/btrfs/scrub.c       |  2 +-
 fs/btrfs/super.c       | 19 +++++++++++++------
 fs/btrfs/transaction.c | 10 +++++-----
 fs/btrfs/tree-log.c    |  4 ++--
 fs/btrfs/volumes.c     | 24 ++++++++++++------------
 12 files changed, 78 insertions(+), 76 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f63c9b3f6e08..5181c53c1124 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -936,8 +936,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	kfree(fs_info->delayed_root);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	kfree(fs_info);
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f58911ece0d..761717c98278 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
-	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
-	       sizeof(fs_info->super_for_commit));
+	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
 	brelse(bh);
 
-	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_alloc;
 
@@ -2152,7 +2151,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
-	kfree(fs_info->delayed_root);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2164,12 +2162,7 @@ fail_bdi:
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
-	kfree(extent_root);
-	kfree(tree_root);
-	kfree(fs_info);
-	kfree(chunk_root);
-	kfree(dev_root);
-	kfree(csum_root);
+	free_fs_info(fs_info);
 	return ERR_PTR(err);
 }
 
@@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	int total_errors = 0;
 	u64 flags;
 
-	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
-	sb = &root->fs_info->super_for_commit;
+	sb = root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root)
 	del_fs_roots(fs_info);
 
 	iput(fs_info->btree_inode);
-	kfree(fs_info->delayed_root);
 
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-	kfree(fs_info->extent_root);
-	kfree(fs_info->tree_root);
-	kfree(fs_info->chunk_root);
-	kfree(fs_info->dev_root);
-	kfree(fs_info->csum_root);
-	kfree(fs_info);
+	free_fs_info(fs_info);
 
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cb7626646bba..782eb3ea8edf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * about 1% of the FS size.
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
-		thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 		thresh = max_t(u64, 64 * 1024 * 1024,
 			       div_factor_fine(thresh, 1));
 
@@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
-	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
 	/* 256MB or 5% of the FS */
 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	u64 num_bytes;
 	u64 meta_used;
 	u64 data_used;
-	int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
 	spin_lock(&sinfo->lock);
@@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_lock);
-	old_val = btrfs_super_bytes_used(&info->super_copy);
+	old_val = btrfs_super_bytes_used(info->super_copy);
 	if (alloc)
 		old_val += num_bytes;
 	else
 		old_val -= num_bytes;
-	btrfs_set_super_bytes_used(&info->super_copy, old_val);
+	btrfs_set_super_bytes_used(info->super_copy, old_val);
 	spin_unlock(&info->delalloc_lock);
 
 	while (total) {
@@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		return -ENOMEM;
 	path->reada = 1;
 
-	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 	if (btrfs_test_opt(root, SPACE_CACHE) &&
-	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
 		need_clear = 1;
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		need_clear = 1;
@@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 	int mixed = 0;
 	int ret;
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		return 1;
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item;
 	struct extent_buffer *leaf;
 	u64 csum_offset = 0;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int csums_in_item;
 
 	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
 	u32 diff;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	int ret;
 	size_t size;
 	u64 csum_end;
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
 				      u64 bytenr, u64 len)
 {
 	struct extent_buffer *leaf;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	u64 csum_end;
 	u64 end_byte = bytenr + len;
 	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	u64 csum_end;
 	struct extent_buffer *leaf;
 	int ret;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
 
 	root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8586e10953c..b6b612e14ed7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
 	}
 
 	BUG_ON(disk_num_bytes >
-	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+	       btrfs_super_total_bytes(root->fs_info->super_copy));
 
 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 33aae13cc74b..8f6e14279409 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
-	u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		mutex_unlock(&inode->i_mutex);
 	}
 
-	disk_super = &root->fs_info->super_copy;
+	disk_super = root->fs_info->super_copy;
 	features = btrfs_super_incompat_flags(disk_super);
 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
@@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 		return PTR_ERR(trans);
 	}
 
-	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
 				   dir_id, "default", 7, 1);
 	if (IS_ERR_OR_NULL(di)) {
@@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	disk_super = &root->fs_info->super_copy;
+	disk_super = root->fs_info->super_copy;
 	features = btrfs_super_incompat_flags(disk_super);
 	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
 		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..69a600f07763 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 	sdev->curr = -1;
 	atomic_set(&sdev->in_flight, 0);
 	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	INIT_LIST_HEAD(&sdev->csum_list);
 
 	spin_lock_init(&sdev->list_lock);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5429b1fa0bfc..f7e9de724ef2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	char *compress_type;
 	bool compress_force = false;
 
-	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 	if (cache_gen)
 		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 
@@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 	 * will mount by default if we haven't been given a specific subvolume
 	 * to mount.
 	 */
-	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
@@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	fs_info->fs_devices = fs_devices;
 	tree_root->fs_info = fs_info;
 
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		error = -ENOMEM;
+		goto error_close_devices;
+	}
+
 	bdev = fs_devices->latest_bdev;
 	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
 	if (IS_ERR(s)) {
@@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		}
 
 		btrfs_close_devices(fs_devices);
-		kfree(fs_info);
+		free_fs_info(fs_info);
 		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
-	kfree(fs_info);
+	free_fs_info(fs_info);
 	kfree(tree_root);
 	return ERR_PTR(error);
 }
@@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (root->fs_info->fs_devices->rw_devices == 0)
 			return -EACCES;
 
-		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+		if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
 		ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 	struct list_head *head = &root->fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29bef63e23ba..373c7ec1a026 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root)
 	struct btrfs_root_item *root_item;
 	struct btrfs_super_block *super;
 
-	super = &root->fs_info->super_copy;
+	super = root->fs_info->super_copy;
 
 	root_item = &root->fs_info->chunk_root->root_item;
 	super->chunk_root = root_item->bytenr;
@@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	update_super_roots(root);
 
 	if (!root->fs_info->log_root_recovering) {
-		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
-		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
 	}
 
-	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
-	       sizeof(root->fs_info->super_copy));
+	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+	       sizeof(*root->fs_info->super_copy));
 
 	trans->transaction->blocked = 0;
 	spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ca1b6b83bd1..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 
-	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
-	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
 	log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6938b45e0fd..c3b45564048e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	call_rcu(&device->rcu, free_device);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
-	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
-	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
 
 	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
@@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
 	struct btrfs_fs_devices *seed_devices;
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 	struct btrfs_device *device;
 	u64 super_flags;
 
@@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 		root->fs_info->fs_devices->rotating = 1;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
 
-	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
-	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+	btrfs_set_super_num_devices(root->fs_info->super_copy,
 				    total_bytes + 1);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
@@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
 	struct btrfs_super_block *super_copy =
-		&device->dev_root->fs_info->super_copy;
+		device->dev_root->fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = new_size - device->total_bytes;
 
@@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	bool retried = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = device->total_bytes;
 	u64 diff = device->total_bytes - new_size;
@@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct btrfs_disk_key disk_key;
 	u32 array_size;
 	u8 *ptr;
@@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root,
 
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
-- 
cgit v1.2.3


From af31f5e5b84b5bf2bcec464153a5130b170b2770 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 15:17:42 -0400
Subject: Btrfs: add a log of past tree roots

This takes some of the free space in the btrfs super block
to record information about most of the roots in the last four
commits.

It also adds a -o recovery to use the root history log when
we're not able to read the tree of tree roots, the extent
tree root, the device tree root or the csum root.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  95 +++++++++++++++++
 fs/btrfs/disk-io.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/super.c   |   7 +-
 3 files changed, 369 insertions(+), 24 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5181c53c1124..78f43d1102a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -360,6 +360,47 @@ struct btrfs_header {
 #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
 #define BTRFS_LABEL_SIZE 256
 
+/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -406,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -1113,6 +1155,9 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1357,6 +1402,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1972,6 +2018,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 761717c98278..a61f8a6cf219 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1134,10 +1134,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->commit_root = NULL;
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
 		free_extent_buffer(root->node);
+		root->node = NULL;
 		return -EIO;
 	}
 	root->commit_root = btrfs_root_node(root);
@@ -1576,6 +1578,228 @@ sleep:
 	return 0;
 }
 
+/*
+ * this will find the highest generation in the array of
+ * root backups.  The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest  root in the array with the generation
+ * in the super block.  If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+	u64 cur;
+	int newest_index = -1;
+	struct btrfs_root_backup *root_backup;
+	int i;
+
+	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+		root_backup = info->super_copy->super_roots + i;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = i;
+	}
+
+	/* check to see if we actually wrapped around */
+	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+		root_backup = info->super_copy->super_roots;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = 0;
+	}
+	return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array.  This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+				     u64 newest_gen)
+{
+	int newest_index = -1;
+
+	newest_index = find_newest_super_backup(info, newest_gen);
+	/* if there was garbage in there, just move along */
+	if (newest_index == -1) {
+		info->backup_root_index = 0;
+	} else {
+		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+	}
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+	int next_backup;
+	struct btrfs_root_backup *root_backup;
+	int last_backup;
+
+	next_backup = info->backup_root_index;
+	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+		BTRFS_NUM_BACKUP_ROOTS;
+
+	/*
+	 * just overwrite the last backup if we're at the same generation
+	 * this happens only at umount
+	 */
+	root_backup = info->super_for_commit->super_roots + last_backup;
+	if (btrfs_backup_tree_root_gen(root_backup) ==
+	    btrfs_header_generation(info->tree_root->node))
+		next_backup = last_backup;
+
+	root_backup = info->super_for_commit->super_roots + next_backup;
+
+	/*
+	 * make sure all of our padding and empty slots get zero filled
+	 * regardless of which ones we use today
+	 */
+	memset(root_backup, 0, sizeof(*root_backup));
+
+	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+	btrfs_set_backup_tree_root_gen(root_backup,
+			       btrfs_header_generation(info->tree_root->node));
+
+	btrfs_set_backup_tree_root_level(root_backup,
+			       btrfs_header_level(info->tree_root->node));
+
+	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+	btrfs_set_backup_chunk_root_gen(root_backup,
+			       btrfs_header_generation(info->chunk_root->node));
+	btrfs_set_backup_chunk_root_level(root_backup,
+			       btrfs_header_level(info->chunk_root->node));
+
+	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root_gen(root_backup,
+			       btrfs_header_generation(info->extent_root->node));
+	btrfs_set_backup_extent_root_level(root_backup,
+			       btrfs_header_level(info->extent_root->node));
+
+	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+	btrfs_set_backup_fs_root_gen(root_backup,
+			       btrfs_header_generation(info->fs_root->node));
+	btrfs_set_backup_fs_root_level(root_backup,
+			       btrfs_header_level(info->fs_root->node));
+
+	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+	btrfs_set_backup_dev_root_gen(root_backup,
+			       btrfs_header_generation(info->dev_root->node));
+	btrfs_set_backup_dev_root_level(root_backup,
+				       btrfs_header_level(info->dev_root->node));
+
+	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root_gen(root_backup,
+			       btrfs_header_generation(info->csum_root->node));
+	btrfs_set_backup_csum_root_level(root_backup,
+			       btrfs_header_level(info->csum_root->node));
+
+	btrfs_set_backup_total_bytes(root_backup,
+			     btrfs_super_total_bytes(info->super_copy));
+	btrfs_set_backup_bytes_used(root_backup,
+			     btrfs_super_bytes_used(info->super_copy));
+	btrfs_set_backup_num_devices(root_backup,
+			     btrfs_super_num_devices(info->super_copy));
+
+	/*
+	 * if we don't copy this out to the super_copy, it won't get remembered
+	 * for the next commit
+	 */
+	memcpy(&info->super_copy->super_roots,
+	       &info->super_for_commit->super_roots,
+	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block.  It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+				     struct btrfs_super_block *super,
+				     int *num_backups_tried, int *backup_index)
+{
+	struct btrfs_root_backup *root_backup;
+	int newest = *backup_index;
+
+	if (*num_backups_tried == 0) {
+		u64 gen = btrfs_super_generation(super);
+
+		newest = find_newest_super_backup(info, gen);
+		if (newest == -1)
+			return -1;
+
+		*backup_index = newest;
+		*num_backups_tried = 1;
+	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+		/* we've tried all the backups, all done */
+		return -1;
+	} else {
+		/* jump to the next oldest backup */
+		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+			BTRFS_NUM_BACKUP_ROOTS;
+		*backup_index = newest;
+		*num_backups_tried += 1;
+	}
+	root_backup = super->super_roots + newest;
+
+	btrfs_set_super_generation(super,
+				   btrfs_backup_tree_root_gen(root_backup));
+	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+	btrfs_set_super_root_level(super,
+				   btrfs_backup_tree_root_level(root_backup));
+	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+	/*
+	 * fixme: the total bytes and num_devices need to match or we should
+	 * need a fsck
+	 */
+	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+	return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+	free_extent_buffer(info->tree_root->node);
+	free_extent_buffer(info->tree_root->commit_root);
+	free_extent_buffer(info->dev_root->node);
+	free_extent_buffer(info->dev_root->commit_root);
+	free_extent_buffer(info->extent_root->node);
+	free_extent_buffer(info->extent_root->commit_root);
+	free_extent_buffer(info->csum_root->node);
+	free_extent_buffer(info->csum_root->commit_root);
+
+	info->tree_root->node = NULL;
+	info->tree_root->commit_root = NULL;
+	info->dev_root->node = NULL;
+	info->dev_root->commit_root = NULL;
+	info->extent_root->node = NULL;
+	info->extent_root->commit_root = NULL;
+	info->csum_root->node = NULL;
+	info->csum_root->commit_root = NULL;
+
+	if (chunk_root) {
+		free_extent_buffer(info->chunk_root->node);
+		free_extent_buffer(info->chunk_root->commit_root);
+		info->chunk_root->node = NULL;
+		info->chunk_root->commit_root = NULL;
+	}
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1603,6 +1827,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	int ret;
 	int err = -EINVAL;
+	int num_backups_tried = 0;
+	int backup_index = 0;
 
 	struct btrfs_super_block *disk_super;
 
@@ -1781,6 +2007,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 
+	/*
+	 * run through our array of backup supers and setup
+	 * our ring pointer to the oldest one
+	 */
+	generation = btrfs_super_generation(disk_super);
+	find_oldest_super_backup(fs_info, generation);
+
 	/*
 	 * In the long term, we'll store the compression type in the super
 	 * block, and it'll be used for per file compression control.
@@ -1938,7 +2171,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1953,11 +2186,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 
 	btrfs_close_extra_devices(fs_devices);
 
+retry_root_backup:
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
@@ -1965,32 +2199,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
-	if (!tree_root->node)
-		goto fail_chunk_root;
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (!tree_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
 		       sb->s_id);
-		goto fail_tree_root;
+
+		goto recovery_tree_root;
 	}
+
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
-		goto fail_tree_root;
+		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
-		goto fail_extent_root;
+		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto recovery_tree_root;
 
 	csum_root->track_dirty = 1;
 
@@ -2123,20 +2358,10 @@ fail_cleaner:
 
 fail_block_groups:
 	btrfs_free_block_groups(fs_info);
-	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
-	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
-	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
-	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+	free_root_pointers(fs_info, 1);
+
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2164,6 +2389,25 @@ fail_srcu:
 fail:
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
+
+recovery_tree_root:
+
+	if (!btrfs_test_opt(tree_root, RECOVERY))
+		goto fail_tree_roots;
+
+	free_root_pointers(fs_info, 0);
+
+	/* don't use the log in recovery mode, it won't be valid */
+	btrfs_set_super_log_root(disk_super, 0);
+
+	/* we can't trust the free space cache either */
+	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+	ret = next_root_backup(fs_info, fs_info->super_copy,
+			       &num_backups_tried, &backup_index);
+	if (ret == -1)
+		goto fail_block_groups;
+	goto retry_root_backup;
 }
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2333,6 +2577,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
+	backup_super_roots(root->fs_info);
 
 	sb = root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7e9de724ef2..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,7 +164,7 @@ enum {
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_no_space_cache, Opt_err,
+	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -198,6 +198,7 @@ static match_table_t tokens = {
 	{Opt_defrag, "autodefrag"},
 	{Opt_inode_cache, "inode_cache"},
 	{Opt_no_space_cache, "no_space_cache"},
+	{Opt_recovery, "recovery"},
 	{Opt_err, NULL},
 };
 
@@ -392,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: enabling auto defrag");
 			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 			break;
+		case Opt_recovery:
+			printk(KERN_INFO "btrfs: enabling auto recovery");
+			btrfs_set_opt(info->mount_opt, RECOVERY);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
-- 
cgit v1.2.3


From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 3 Nov 2011 22:54:25 -0400
Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion

I've been hitting warnings in use_block_rsv when running the delayed insertion
stuff.  It's because we will readjust global block rsv based on what is in use,
which means we could end up discarding reservations that are for the delayed
insertion stuff.  So instead create a seperate block rsv for the delayed
insertion stuff.  This will also make it easier to debug problems with the
delayed insertion reservations since we will know that only the delayed
insertion code touches this block_rsv.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h         |  2 ++
 fs/btrfs/delayed-inode.c | 14 +++++++-------
 fs/btrfs/disk-io.c       |  1 +
 fs/btrfs/extent-tree.c   |  3 +++
 4 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78f43d1102a0..3002e5d4da0b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -957,6 +957,8 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..fc4026af7290 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	if (!item->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
 	if (!node->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->global_block_rsv;
+	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
 	if (!ret)
@@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 		goto free_path;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
 	if (!ret)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a61f8a6cf219..23b6776477b7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 782eb3ea8edf..01c1f08b976a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->delayed_block_rsv.space_info = space_info;
 
 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From c674e04e1cd6049715e7b9446790f4b441e547c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 22:23:13 -0400
Subject: Btrfs: fix extent_buffer leak in the metadata IO error handling

The scrub readahead branch brought in a new error handling hook,
but it was leaking extent_buffer references.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0eb1f0951251..40a62b980087 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -644,6 +644,7 @@ static int btree_io_failed_hook(struct bio *failed_bio,
 		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	}
+	free_extent_buffer(eb);
 
 out:
 	return -EIO;	/* we fixed nothing */
-- 
cgit v1.2.3


From 306c8b68c82dfe6b7c9e5b61985760ad5d089205 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 15:21:39 -0400
Subject: Btrfs: stop the readahead threads on failed mount

If we don't stop them, they linger around corrupting
memory by using pointers to freed things.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 40a62b980087..e532892431f4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2444,6 +2444,7 @@ fail_tree_roots:
 
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
+	btrfs_stop_workers(&fs_info->readahead_workers);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
-- 
cgit v1.2.3


From 7c7e82a77fe3d89ae50824aa7c897454675eb4c4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 6 Nov 2011 18:50:56 -0500
Subject: Btrfs: check for a null fs root when writing to the backup root log

During log replay, can commit the transaction before the fs_root
pointers are setup, so we have to make sure they are not null before
trying to use them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e532892431f4..e53a5bb85670 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1754,11 +1754,18 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 	btrfs_set_backup_extent_root_level(root_backup,
 			       btrfs_header_level(info->extent_root->node));
 
-	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
-	btrfs_set_backup_fs_root_gen(root_backup,
+	/*
+	 * we might commit during log recovery, which happens before we set
+	 * the fs_root.  Make sure it is valid before we fill it in.
+	 */
+	if (info->fs_root && info->fs_root->node) {
+		btrfs_set_backup_fs_root(root_backup,
+					 info->fs_root->node->start);
+		btrfs_set_backup_fs_root_gen(root_backup,
 			       btrfs_header_generation(info->fs_root->node));
-	btrfs_set_backup_fs_root_level(root_backup,
+		btrfs_set_backup_fs_root_level(root_backup,
 			       btrfs_header_level(info->fs_root->node));
+	}
 
 	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
 	btrfs_set_backup_dev_root_gen(root_backup,
-- 
cgit v1.2.3


From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 9 Nov 2011 00:08:15 +0200
Subject: Btrfs: avoid null dereference and leaks when bailing from
 open_ctree()

Fix bugs introduced by 6c41761f.  Firstly, after failing to allocate any
of the tree roots (first 'goto fail' in open_ctree()) we would
dereference a NULL fs_info pointer in free_fs_info().  Secondly, after
failures from init_srcu_struct(), setup_bdi() and new_inode() we would
leak all earlier allocated roots: fs_info fields haven't been
initialized yet so free_fs_info() is rendered useless.

Fix this by initializing fs_info pointer and fs_info fields before any
allocations happen.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/disk-io.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e53a5bb85670..91db90b526c2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
-	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
+	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = NULL;
-	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
-	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
-					      GFP_NOFS);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_root *log_tree_root;
-
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	struct btrfs_super_block *disk_super;
+	extent_root = fs_info->extent_root =
+		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	csum_root = fs_info->csum_root =
+		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	chunk_root = fs_info->chunk_root =
+		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	dev_root = fs_info->dev_root =
+		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 
-	if (!extent_root || !tree_root || !tree_root->fs_info ||
-	    !chunk_root || !dev_root || !csum_root) {
+	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	fs_info = tree_root->fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
-	fs_info->tree_root = tree_root;
-	fs_info->extent_root = extent_root;
-	fs_info->csum_root = csum_root;
-	fs_info->chunk_root = chunk_root;
-	fs_info->dev_root = dev_root;
-	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-- 
cgit v1.2.3


From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 9 Nov 2011 13:26:37 +0200
Subject: Btrfs: close devices on all error paths in open_ctree()

Fix a bug introduced by 7e662854 where we would leave devices busy on
certain error paths in open_ctree().  fs_info is guaranteed to be
non-NULL now so it's safe to dereference it on all error paths.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/disk-io.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 91db90b526c2..b6a5c0dd0dd8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2460,21 +2460,20 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
 fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-
-	btrfs_close_devices(fs_info->fs_devices);
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
 
 recovery_tree_root:
-
 	if (!btrfs_test_opt(tree_root, RECOVERY))
 		goto fail_tree_roots;
 
-- 
cgit v1.2.3


From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Nov 2011 15:07:51 -0500
Subject: Btrfs: fix barrier flushes

When btrfs is writing the super blocks, it send barrier flushes to make
sure writeback caching drives get all the metadata on disk in the
right order.

But, we have two bugs in the way these are sent down.  When doing
full commits (not via the tree log), we are sending the barrier down
before the last super when it should be going down before the first.

In multi-device setups, we should be waiting for the barriers to
complete on all devices before writing any of the supers.

Both of these bugs can cause corruptions on power failures.  We fix it
with some new code to send down empty barriers to all devices before
writing the first super.

Alexandre Oliva found the multi-device bug.  Arne Jansen did the async
barrier loop.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Alexandre Oliva <oliva@lsd.ic.unicamp.br>
---
 fs/btrfs/disk-io.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/volumes.h |   6 +++
 2 files changed, 134 insertions(+), 17 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b6a5c0dd0dd8..48d30138237f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
 	int errors = 0;
 	u32 crc;
 	u64 bytenr;
-	int last_barrier = 0;
 
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
-	/* make sure only the last submit_bh does a barrier */
-	if (do_barriers) {
-		for (i = 0; i < max_mirrors; i++) {
-			bytenr = btrfs_sb_offset(i);
-			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-			    device->total_bytes)
-				break;
-			last_barrier = i;
-		}
-	}
-
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
 
-		if (i == last_barrier && do_barriers)
-			ret = submit_bh(WRITE_FLUSH_FUA, bh);
-		else
-			ret = submit_bh(WRITE_SYNC, bh);
-
+		/*
+		 * we fua the first super.  The others we allow
+		 * to go down lazy.
+		 */
+		ret = submit_bh(WRITE_FUA, bh);
 		if (ret)
 			errors++;
 	}
 	return errors < i ? 0 : -1;
 }
 
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	if (bio->bi_private)
+		complete(bio->bi_private);
+	bio_put(bio);
+}
+
+/*
+ * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
+ * sent down.  With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp are flagged as not-barrier
+ * capable
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	if (device->nobarriers)
+		return 0;
+
+	if (wait) {
+		bio = device->flush_bio;
+		if (!bio)
+			return 0;
+
+		wait_for_completion(&device->flush_wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+			printk("btrfs: disabling barriers on dev %s\n",
+			       device->name);
+			device->nobarriers = 1;
+		}
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
+			ret = -EIO;
+		}
+
+		/* drop the reference from the wait == 0 run */
+		bio_put(bio);
+		device->flush_bio = NULL;
+
+		return ret;
+	}
+
+	/*
+	 * one reference for us, and we leave it for the
+	 * caller
+	 */
+	device->flush_bio = NULL;;
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_end_io = btrfs_end_empty_barrier;
+	bio->bi_bdev = device->bdev;
+	init_completion(&device->flush_wait);
+	bio->bi_private = &device->flush_wait;
+	device->flush_bio = bio;
+
+	bio_get(bio);
+	submit_bio(WRITE_FLUSH, bio);
+
+	return 0;
+}
+
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+	struct list_head *head;
+	struct btrfs_device *dev;
+	int errors = 0;
+	int ret;
+
+	/* send down all the barriers */
+	head = &info->fs_devices->devices;
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 0);
+		if (ret)
+			errors++;
+	}
+
+	/* wait for all the barriers */
+	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (!dev->bdev) {
+			errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata || !dev->writeable)
+			continue;
+
+		ret = write_dev_flush(dev, 1);
+		if (ret)
+			errors++;
+	}
+	if (errors)
+		return -EIO;
+	return 0;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
+
+	if (do_barriers)
+		barrier_all_devices(root->fs_info);
+
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
 	struct reada_zone *reada_curr_zone;
 	struct radix_tree_root reada_zones;
 	struct radix_tree_root reada_extents;
+
+	/* for sending down flush barriers */
+	struct bio *flush_bio;
+	struct completion flush_wait;
+	int nobarriers;
+
 };
 
 struct btrfs_fs_devices {
-- 
cgit v1.2.3


From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Sun, 20 Nov 2011 07:33:38 -0500
Subject: btrfs: mirror_num should be int, not u64

My previous patch introduced some u64 for failed_mirror variables, this one
makes it consistent again.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 2 +-
 fs/btrfs/extent_io.c | 4 ++--
 fs/btrfs/extent_io.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 48d30138237f..94abc25392f6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
 
 static int btree_io_failed_hook(struct bio *failed_bio,
 			 struct page *page, u64 start, u64 end,
-			 u64 mirror_num, struct extent_state *state)
+			 int mirror_num, struct extent_state *state)
 {
 	struct extent_io_tree *tree;
 	unsigned long len;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 47fdba7853c3..a877b8b212eb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 				clean_io_failure(start, page);
 		}
 		if (!uptodate) {
-			u64 failed_mirror;
-			failed_mirror = (unsigned long)bio->bi_bdev;
+			int failed_mirror;
+			failed_mirror = (int)(unsigned long)bio->bi_bdev;
 			if (tree->ops && tree->ops->readpage_io_failed_hook)
 				ret = tree->ops->readpage_io_failed_hook(
 						bio, page, start, end,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
 			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end, u64 failed_mirror,
+				       u64 start, u64 end, int failed_mirror,
 				       struct extent_state *state);
 	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
 					u64 start, u64 end,
-- 
cgit v1.2.3