summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2007-08-07 16:15:09 -0400
committerDavid Woodhouse <dwmw2@hera.kernel.org>2007-08-07 16:15:09 -0400
commit6702ed490ca0bb44e17131818a5a18b773957c5a (patch)
treef6bc4b77e58506a68f1ab539ca6c96854f39f92d
parent3c69faecb8d83cb2ef085a98b196a3fecea67725 (diff)
downloadlwn-6702ed490ca0bb44e17131818a5a18b773957c5a.tar.gz
lwn-6702ed490ca0bb44e17131818a5a18b773957c5a.zip
Btrfs: Add run time btree defrag, and an ioctl to force btree defrag
This adds two types of btree defrag, a run time form that tries to defrag recently allocated blocks in the btree when they are still in ram, and an ioctl that forces defrag of all btree blocks. File data blocks are not defragged yet, but this can make a huge difference in sequential btree reads. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/ctree.c195
-rw-r--r--fs/btrfs/ctree.h19
-rw-r--r--fs/btrfs/disk-io.c8
-rw-r--r--fs/btrfs/extent-tree.c53
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode.c33
-rw-r--r--fs/btrfs/ioctl.h2
-rw-r--r--fs/btrfs/transaction.c105
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-defrag.c222
11 files changed, 553 insertions, 89 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a4e2df66bdbb..9321438c5eef 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
obj-m := btrfs.o
btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
hash.o file-item.o inode-item.o inode-map.o disk-io.o \
- transaction.o bit-radix.o inode.o file.o
+ transaction.o bit-radix.o inode.o file.o tree-defrag.o
#btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
# root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7a08491e208e..c7e47e77723f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -65,44 +65,44 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
memset(p, 0, sizeof(*p));
}
-static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct buffer_head *buf, struct buffer_head
*parent, int parent_slot, struct buffer_head
- **cow_ret)
+ **cow_ret, u64 search_start, u64 empty_size)
{
struct buffer_head *cow;
struct btrfs_node *cow_node;
- int ret;
+ int ret = 0;
+ int different_trans = 0;
+ WARN_ON(root->ref_cows && trans->transid != root->last_trans);
WARN_ON(!buffer_uptodate(buf));
- if (trans->transaction != root->fs_info->running_transaction) {
- printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
- root->fs_info->running_transaction->transid);
- WARN_ON(1);
- }
- if (trans->transid != root->fs_info->generation) {
- printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
- root->fs_info->generation);
- WARN_ON(1);
- }
- if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
- trans->transid) {
- *cow_ret = buf;
- return 0;
- }
- cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
+ cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
+
cow_node = btrfs_buffer_node(cow);
if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
WARN_ON(1);
+
memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
btrfs_set_header_generation(&cow_node->header, trans->transid);
btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
- ret = btrfs_inc_ref(trans, root, buf);
- if (ret)
- return ret;
+
+ WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
+ trans->transid);
+ if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+ trans->transid) {
+ different_trans = 1;
+ ret = btrfs_inc_ref(trans, root, buf);
+ if (ret)
+ return ret;
+ } else {
+ WARN_ON(!root->ref_cows);
+ clean_tree_block(trans, root, buf);
+ }
+
if (buf == root->node) {
root->node = cow;
get_bh(cow);
@@ -114,6 +114,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
bh_blocknr(cow));
btrfs_mark_buffer_dirty(parent);
+ WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
+ trans->transid);
btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
}
btrfs_block_release(root, buf);
@@ -122,6 +124,115 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
return 0;
}
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct buffer_head *buf, struct buffer_head
+ *parent, int parent_slot, struct buffer_head
+ **cow_ret)
+{
+ u64 search_start;
+ if (trans->transaction != root->fs_info->running_transaction) {
+ printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+ root->fs_info->running_transaction->transid);
+ WARN_ON(1);
+ }
+ if (trans->transid != root->fs_info->generation) {
+ printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+ root->fs_info->generation);
+ WARN_ON(1);
+ }
+ if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
+ trans->transid) {
+ *cow_ret = buf;
+ return 0;
+ }
+
+ search_start = bh_blocknr(buf) & ~((u64)65535);
+ return __btrfs_cow_block(trans, root, buf, parent,
+ parent_slot, cow_ret, search_start, 0);
+}
+
+static int close_blocks(u64 blocknr, u64 other)
+{
+ if (blocknr < other && other - blocknr < 8)
+ return 1;
+ if (blocknr > other && blocknr - other < 8)
+ return 1;
+ return 0;
+}
+
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct buffer_head *parent,
+ int cache_only)
+{
+ struct btrfs_node *parent_node;
+ struct buffer_head *cur_bh;
+ struct buffer_head *tmp_bh;
+ u64 blocknr;
+ u64 search_start = 0;
+ u64 other;
+ u32 parent_nritems;
+ int start_slot;
+ int end_slot;
+ int i;
+ int err = 0;
+
+ if (trans->transaction != root->fs_info->running_transaction) {
+ printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+ root->fs_info->running_transaction->transid);
+ WARN_ON(1);
+ }
+ if (trans->transid != root->fs_info->generation) {
+ printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+ root->fs_info->generation);
+ WARN_ON(1);
+ }
+ parent_node = btrfs_buffer_node(parent);
+ parent_nritems = btrfs_header_nritems(&parent_node->header);
+
+ start_slot = 0;
+ end_slot = parent_nritems;
+
+ if (parent_nritems == 1)
+ return 0;
+
+ for (i = start_slot; i < end_slot; i++) {
+ int close = 1;
+ blocknr = btrfs_node_blockptr(parent_node, i);
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent_node, i - 1);
+ close = close_blocks(blocknr, other);
+ }
+ if (close && i < end_slot - 1) {
+ other = btrfs_node_blockptr(parent_node, i + 1);
+ close = close_blocks(blocknr, other);
+ }
+ if (close)
+ continue;
+
+ cur_bh = btrfs_find_tree_block(root, blocknr);
+ if (!cur_bh || !buffer_uptodate(cur_bh) ||
+ buffer_locked(cur_bh)) {
+ if (cache_only) {
+ brelse(cur_bh);
+ continue;
+ }
+ brelse(cur_bh);
+ cur_bh = read_tree_block(root, blocknr);
+ }
+ if (search_start == 0) {
+ search_start = bh_blocknr(cur_bh) & ~((u64)65535);
+ }
+ err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
+ &tmp_bh, search_start,
+ min(8, end_slot - i));
+ if (err)
+ break;
+ search_start = bh_blocknr(tmp_bh);
+ brelse(tmp_bh);
+ }
+ return err;
+}
+
/*
* The leaf data grows from end-to-front in the node.
* this returns the address of the start of the last item,
@@ -221,6 +332,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
parent_slot = path->slots[level + 1];
parent_key = &parent->ptrs[parent_slot].key;
+
BUG_ON(memcmp(parent_key, &leaf->items[0].key,
sizeof(struct btrfs_disk_key)));
BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
@@ -643,7 +755,7 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* readahead one full node of leaves
*/
static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
- int slot)
+ int level, int slot)
{
struct btrfs_node *node;
int i;
@@ -659,10 +771,13 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
unsigned long gang[8];
struct buffer_head *bh;
- if (!path->nodes[1])
+ if (level == 0)
+ return;
+
+ if (!path->nodes[level])
return;
- node = btrfs_buffer_node(path->nodes[1]);
+ node = btrfs_buffer_node(path->nodes[level]);
search = btrfs_node_blockptr(node, slot);
bh = btrfs_find_tree_block(root, search);
if (bh) {
@@ -690,7 +805,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
for (i = 0; i < ret; i++) {
blocknr = gang[i];
clear_radix_bit(&found, blocknr);
- if (nread > 64)
+ if (nread > 32)
continue;
if (direction > 0 && cluster_start <= blocknr &&
cluster_start + 8 > blocknr) {
@@ -726,7 +841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
struct buffer_head *b;
struct buffer_head *cow_buf;
struct btrfs_node *c;
- struct btrfs_root_item *root_item = &root->root_item;
u64 blocknr;
int slot;
int ret;
@@ -734,11 +848,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
int should_reada = p->reada;
u8 lowest_level = 0;
- if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
- lowest_level = root_item->drop_level;
- WARN_ON(ins_len || cow);
- }
-
+ lowest_level = p->lowest_level;
+ WARN_ON(lowest_level && ins_len);
WARN_ON(p->nodes[0] != NULL);
WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
again:
@@ -798,8 +909,8 @@ again:
if (level == lowest_level)
break;
blocknr = btrfs_node_blockptr(c, slot);
- if (level == 1 && should_reada)
- reada_for_search(root, p, slot);
+ if (should_reada)
+ reada_for_search(root, p, level, slot);
b = read_tree_block(root, btrfs_node_blockptr(c, slot));
} else {
@@ -960,7 +1071,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
BUG_ON(path->nodes[level]);
BUG_ON(path->nodes[level-1] != root->node);
- t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+ t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
if (IS_ERR(t))
return PTR_ERR(t);
c = btrfs_buffer_node(t);
@@ -1070,7 +1181,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
}
c_nritems = btrfs_header_nritems(&c->header);
- split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+ split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
if (IS_ERR(split_buffer))
return PTR_ERR(split_buffer);
@@ -1461,7 +1572,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
nritems = btrfs_header_nritems(&l->header);
mid = (nritems + 1)/ 2;
- right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+ right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
if (IS_ERR(right_buffer))
return PTR_ERR(right_buffer);
@@ -1560,7 +1671,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
if (!double_split)
return ret;
- right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+ right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
if (IS_ERR(right_buffer))
return PTR_ERR(right_buffer);
@@ -1988,8 +2099,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
blocknr = btrfs_node_blockptr(c_node, slot);
if (next)
btrfs_block_release(root, next);
- if (level == 1 && path->reada)
- reada_for_search(root, path, slot);
+ if (path->reada)
+ reada_for_search(root, path, level, slot);
next = read_tree_block(root, blocknr);
break;
}
@@ -2002,8 +2113,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
path->slots[level] = 0;
if (!level)
break;
- if (level == 1 && path->reada)
- reada_for_search(root, path, slot);
+ if (path->reada)
+ reada_for_search(root, path, level, slot);
next = read_tree_block(root,
btrfs_node_blockptr(btrfs_buffer_node(next), 0));
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c5a18d5d7f7c..42aa20301bc9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -178,6 +178,7 @@ struct btrfs_path {
struct buffer_head *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
int reada;
+ int lowest_level;
};
/*
@@ -338,6 +339,9 @@ struct btrfs_root {
u64 highest_inode;
u64 last_inode_alloc;
int ref_cows;
+ struct btrfs_key defrag_progress;
+ int defrag_running;
+ int defrag_level;
};
/* the lower bits in the key flags defines the item type */
@@ -1031,10 +1035,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 hint);
+ struct btrfs_root *root, u64 hint,
+ u64 empty_size);
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
- u64 num_blocks, u64 search_start,
+ u64 num_blocks, u64 empty_size, u64 search_start,
u64 search_end, struct btrfs_key *ins, int data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct buffer_head *buf);
@@ -1051,6 +1056,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
/* ctree.c */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct buffer_head *buf, struct buffer_head
+ *parent, int parent_slot, struct buffer_head
+ **cow_ret);
int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, u32 data_size);
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1060,6 +1069,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct buffer_head *parent,
+ int cache_only);
void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
@@ -1171,4 +1183,7 @@ extern struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 *hint_block);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only);
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60db85bc4ba8..c948416fea3a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -273,7 +273,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct buffer_head *buf)
{
WARN_ON(atomic_read(&buf->b_count) == 0);
+ lock_buffer(buf);
clear_buffer_dirty(buf);
+ unlock_buffer(buf);
return 0;
}
@@ -294,6 +296,9 @@ static int __setup_root(int blocksize,
root->last_inode_alloc = 0;
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
+ memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+ root->defrag_running = 0;
+ root->defrag_level = 0;
root->root_key.objectid = objectid;
return 0;
}
@@ -585,6 +590,7 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
btrfs_transaction_flush_work(root);
mutex_lock(&fs_info->fs_mutex);
+ btrfs_defrag_dirty_roots(root->fs_info);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
/* run commit again to drop the original snapshot */
@@ -616,7 +622,9 @@ void btrfs_mark_buffer_dirty(struct buffer_head *bh)
{
struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+
WARN_ON(!atomic_read(&bh->b_count));
+
if (transid != root->fs_info->generation) {
printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
(unsigned long long)bh->b_blocknr,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5d4d5d8db8ef..26b8d3406491 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,8 @@
#include "transaction.h"
static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
- *orig_root, u64 num_blocks, u64 search_start,
+ *orig_root, u64 num_blocks, u64 empty_size,
+ u64 search_start,
u64 search_end, u64 hint_block,
struct btrfs_key *ins, u64 exclude_start,
u64 exclude_nr, int data);
@@ -379,7 +380,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+ ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
(u64)-1, 0, &ins, 0, 0, 0);
if (ret) {
btrfs_free_path(path);
@@ -533,7 +534,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct btrfs_block_group_item *bi;
struct btrfs_key ins;
- ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+ ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
0, 0, 0);
/* FIXME, set bit to recalc cache groups on next mount */
if (ret)
@@ -708,6 +709,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
static int try_remove_page(struct address_space *mapping, unsigned long index)
{
int ret;
+ return 0;
ret = invalidate_mapping_pages(mapping, index, index);
return ret;
}
@@ -866,7 +868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
if (!path)
return -ENOMEM;
- ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
+ ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
if (ret) {
btrfs_free_path(path);
return ret;
@@ -983,8 +985,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
* Any available blocks before search_start are skipped.
*/
static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
- *orig_root, u64 num_blocks, u64 search_start, u64
- search_end, u64 hint_block,
+ *orig_root, u64 num_blocks, u64 empty_size,
+ u64 search_start, u64 search_end, u64 hint_block,
struct btrfs_key *ins, u64 exclude_start,
u64 exclude_nr, int data)
{
@@ -1042,6 +1044,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
data, 1);
}
+ total_needed += empty_size;
path = btrfs_alloc_path();
check_failed:
@@ -1157,9 +1160,11 @@ check_pending:
goto error;
}
search_start = orig_search_start;
- if (wrapped)
+ if (wrapped) {
+ if (!full_scan)
+ total_needed -= empty_size;
full_scan = 1;
- else
+ } else
wrapped = 1;
goto new_group;
}
@@ -1238,9 +1243,11 @@ new_group:
ret = -ENOSPC;
goto error;
}
- if (wrapped)
+ if (wrapped) {
+ if (!full_scan)
+ total_needed -= empty_size;
full_scan = 1;
- else
+ } else
wrapped = 1;
}
block_group = btrfs_lookup_block_group(info, search_start);
@@ -1264,7 +1271,7 @@ error:
*/
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
- u64 num_blocks, u64 hint_block,
+ u64 num_blocks, u64 empty_size, u64 hint_block,
u64 search_end, struct btrfs_key *ins, int data)
{
int ret;
@@ -1303,7 +1310,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
* in the correct block group.
*/
if (data) {
- ret = find_free_extent(trans, root, 0, 0,
+ ret = find_free_extent(trans, root, 0, 0, 0,
search_end, 0, &prealloc_key, 0, 0, 0);
BUG_ON(ret);
if (ret)
@@ -1313,8 +1320,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
}
/* do the real allocation */
- ret = find_free_extent(trans, root, num_blocks, search_start,
- search_end, hint_block, ins,
+ ret = find_free_extent(trans, root, num_blocks, empty_size,
+ search_start, search_end, hint_block, ins,
exclude_start, exclude_nr, data);
BUG_ON(ret);
if (ret)
@@ -1333,7 +1340,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
exclude_start = ins->objectid;
exclude_nr = ins->offset;
hint_block = exclude_start + exclude_nr;
- ret = find_free_extent(trans, root, 0, search_start,
+ ret = find_free_extent(trans, root, 0, 0, search_start,
search_end, hint_block,
&prealloc_key, exclude_start,
exclude_nr, 0);
@@ -1368,14 +1375,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
* returns the tree buffer or NULL.
*/
struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 hint)
+ struct btrfs_root *root, u64 hint,
+ u64 empty_size)
{
struct btrfs_key ins;
int ret;
struct buffer_head *buf;
ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
- 1, hint, (unsigned long)-1, &ins, 0);
+ 1, empty_size, hint,
+ (unsigned long)-1, &ins, 0);
if (ret) {
BUG_ON(ret > 0);
return ERR_PTR(ret);
@@ -1385,6 +1394,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
btrfs_free_extent(trans, root, ins.objectid, 1, 0);
return ERR_PTR(-ENOMEM);
}
+ WARN_ON(buffer_dirty(buf));
set_buffer_uptodate(buf);
set_buffer_checked(buf);
set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
@@ -1591,13 +1601,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_key key;
struct btrfs_disk_key *found_key;
struct btrfs_node *node;
+
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ level = root_item->drop_level;
+ path->lowest_level = level;
wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
+ if (wret < 0) {
ret = wret;
goto out;
}
- level = root_item->drop_level;
node = btrfs_buffer_node(path->nodes[level]);
found_key = &node->ptrs[path->slots[level]].key;
WARN_ON(memcmp(found_key, &root_item->drop_progress,
@@ -1617,8 +1629,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
ret = wret;
num_walks++;
if (num_walks > 10) {
- struct btrfs_key key;
- btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
ret = -EAGAIN;
get_bh(root->node);
break;
@@ -1627,6 +1637,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
btrfs_block_release(root, path->nodes[i]);
+ path->nodes[i] = 0;
}
}
out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fe38fe84150..00b118a2db69 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -512,7 +512,7 @@ static int prepare_pages(struct btrfs_root *root,
if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
err = btrfs_alloc_extent(trans, root, inode->i_ino,
- num_blocks, hint_block, (u64)-1,
+ num_blocks, 0, hint_block, (u64)-1,
&ins, 1);
if (err)
goto failed_truncate;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3889032fc449..12aa043b9f6f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -554,7 +554,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
&alloc_hint);
if (ret)
goto out;
- ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+ ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
alloc_hint, (u64)-1, &ins, 1);
if (ret)
goto out;
@@ -1360,7 +1360,7 @@ not_found:
if (create & BTRFS_GET_BLOCK_CREATE) {
struct btrfs_key ins;
ret = btrfs_alloc_extent(trans, root, inode->i_ino,
- 1, alloc_hint, (u64)-1,
+ 1, 0, alloc_hint, (u64)-1,
&ins, 1);
if (ret) {
err = ret;
@@ -1998,7 +1998,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
- subvol = btrfs_alloc_free_block(trans, root, 0);
+ subvol = btrfs_alloc_free_block(trans, root, 0, 0);
if (IS_ERR(subvol))
return PTR_ERR(subvol);
leaf = btrfs_buffer_leaf(subvol);
@@ -2159,7 +2159,9 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args vol_args;
+ struct btrfs_trans_handle *trans;
int ret = 0;
+ int err;
struct btrfs_dir_item *di;
int namelen;
struct btrfs_path *path;
@@ -2196,6 +2198,31 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
else
ret = create_snapshot(root, vol_args.name, namelen);
break;
+
+ case BTRFS_IOC_DEFRAG:
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+ while (1) {
+ root->defrag_running = 1;
+ err = btrfs_defrag_leaves(trans, root, 0);
+
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+
+ btrfs_btree_balance_dirty(root);
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = btrfs_start_transaction(root, 1);
+ if (err != -EAGAIN)
+ break;
+ }
+ root->defrag_running = 0;
+ btrfs_end_transaction(trans, root);
+ mutex_unlock(&root->fs_info->fs_mutex);
+ ret = 0;
+ break;
default:
return -ENOTTY;
}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 23bed48c51a7..8bc47dec2865 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -28,6 +28,6 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
struct btrfs_ioctl_vol_args)
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 498626470a04..338a7199363b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -29,6 +29,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
static struct workqueue_struct *trans_wq;
#define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_DEFRAG_TAG 1
static void put_transaction(struct btrfs_transaction *transaction)
{
@@ -69,35 +70,41 @@ static int join_transaction(struct btrfs_root *root)
return 0;
}
+static int record_root_in_trans(struct btrfs_root *root)
+{
+ u64 running_trans_id = root->fs_info->running_transaction->transid;
+ if (root->ref_cows && root->last_trans < running_trans_id) {
+ WARN_ON(root == root->fs_info->extent_root);
+ if (root->root_item.refs != 0) {
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_DEFRAG_TAG);
+ root->commit_root = root->node;
+ get_bh(root->node);
+ } else {
+ WARN_ON(1);
+ }
+ root->last_trans = running_trans_id;
+ }
+ return 0;
+}
+
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
int num_blocks)
{
struct btrfs_trans_handle *h =
kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
int ret;
- u64 running_trans_id;
mutex_lock(&root->fs_info->trans_mutex);
ret = join_transaction(root);
BUG_ON(ret);
- running_trans_id = root->fs_info->running_transaction->transid;
- if (root != root->fs_info->tree_root && root->last_trans <
- running_trans_id) {
- WARN_ON(root == root->fs_info->extent_root);
- WARN_ON(root->ref_cows != 1);
- if (root->root_item.refs != 0) {
- radix_tree_tag_set(&root->fs_info->fs_roots_radix,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- root->commit_root = root->node;
- get_bh(root->node);
- } else {
- WARN_ON(1);
- }
- }
- root->last_trans = running_trans_id;
- h->transid = running_trans_id;
+ record_root_in_trans(root);
+ h->transid = root->fs_info->running_transaction->transid;
h->transaction = root->fs_info->running_transaction;
h->blocks_reserved = num_blocks;
h->blocks_used = 0;
@@ -155,6 +162,15 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
gang[i]);
if (!page)
continue;
+ if (PageWriteback(page)) {
+ if (PageDirty(page))
+ wait_on_page_writeback(page);
+ else {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+ }
err = write_one_page(page, 0);
if (err)
werr = err;
@@ -299,6 +315,58 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
return err;
}
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
+{
+ struct btrfs_root *gang[1];
+ struct btrfs_root *root;
+ struct btrfs_root *tree_root = info->tree_root;
+ struct btrfs_trans_handle *trans;
+ int i;
+ int ret;
+ int err = 0;
+ u64 last = 0;
+
+ trans = btrfs_start_transaction(tree_root, 1);
+ while(1) {
+ ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
+ (void **)gang, last,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_DEFRAG_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ root = gang[i];
+ last = root->root_key.objectid + 1;
+ radix_tree_tag_clear(&info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_DEFRAG_TAG);
+ if (root->defrag_running)
+ continue;
+
+ while (1) {
+ mutex_lock(&root->fs_info->trans_mutex);
+ record_root_in_trans(root);
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ root->defrag_running = 1;
+ err = btrfs_defrag_leaves(trans, root, 1);
+ btrfs_end_transaction(trans, tree_root);
+ mutex_unlock(&info->fs_mutex);
+
+ btrfs_btree_balance_dirty(root);
+
+ mutex_lock(&info->fs_mutex);
+ trans = btrfs_start_transaction(tree_root, 1);
+ if (err != -EAGAIN)
+ break;
+ }
+ root->defrag_running = 0;
+ }
+ }
+ btrfs_end_transaction(trans, tree_root);
+ return err;
+}
+
static int drop_dirty_roots(struct btrfs_root *tree_root,
struct list_head *list)
{
@@ -475,6 +543,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
goto out;
}
mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_defrag_dirty_roots(root->fs_info);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
out:
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ebf44f3e1118..8b2714e65562 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -70,5 +70,6 @@ void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
void btrfs_init_transaction_sys(void);
void btrfs_exit_transaction_sys(void);
int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..15d0a486fb59
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+
+static void reada_defrag(struct btrfs_root *root,
+ struct btrfs_node *node)
+{
+ int i;
+ u32 nritems;
+ u64 blocknr;
+ int ret;
+
+ nritems = btrfs_header_nritems(&node->header);
+ for (i = 0; i < nritems; i++) {
+ blocknr = btrfs_node_blockptr(node, i);
+ ret = readahead_tree_block(root, blocknr);
+ if (ret)
+ break;
+ }
+}
+
+static int defrag_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ int cache_only)
+{
+ struct buffer_head *next;
+ struct buffer_head *cur;
+ u64 blocknr;
+ int ret = 0;
+
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+ while(*level > 0) {
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ cur = path->nodes[*level];
+
+ if (!cache_only && *level > 1 && path->slots[*level] == 0)
+ reada_defrag(root, btrfs_buffer_node(cur));
+
+ if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+ WARN_ON(1);
+
+ if (path->slots[*level] >=
+ btrfs_header_nritems(btrfs_buffer_header(cur)))
+ break;
+
+ if (*level == 1) {
+ ret = btrfs_realloc_node(trans, root,
+ path->nodes[*level],
+ cache_only);
+ break;
+ }
+ blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+ path->slots[*level]);
+
+ if (cache_only) {
+ next = btrfs_find_tree_block(root, blocknr);
+ if (!next || !buffer_uptodate(next) ||
+ buffer_locked(next)) {
+ brelse(next);
+ path->slots[*level]++;
+ continue;
+ }
+ } else {
+ next = read_tree_block(root, blocknr);
+ }
+ ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
+ path->slots[*level], &next);
+ BUG_ON(ret);
+ ret = btrfs_realloc_node(trans, root, next, cache_only);
+ BUG_ON(ret);
+ WARN_ON(*level <= 0);
+ if (path->nodes[*level-1])
+ btrfs_block_release(root, path->nodes[*level-1]);
+ path->nodes[*level-1] = next;
+ *level = btrfs_header_level(btrfs_buffer_header(next));
+ path->slots[*level] = 0;
+ }
+ WARN_ON(*level < 0);
+ WARN_ON(*level >= BTRFS_MAX_LEVEL);
+ btrfs_block_release(root, path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level += 1;
+ WARN_ON(ret);
+ return 0;
+}
+
+static int defrag_walk_up(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int *level,
+ int cache_only)
+{
+ int i;
+ int slot;
+ struct btrfs_node *node;
+
+ for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+ slot = path->slots[i];
+ if (slot < btrfs_header_nritems(
+ btrfs_buffer_header(path->nodes[i])) - 1) {
+ path->slots[i]++;
+ *level = i;
+ node = btrfs_buffer_node(path->nodes[i]);
+ WARN_ON(i == 0);
+ btrfs_disk_key_to_cpu(&root->defrag_progress,
+ &node->ptrs[path->slots[i]].key);
+ root->defrag_level = i;
+ return 0;
+ } else {
+ btrfs_block_release(root, path->nodes[*level]);
+ path->nodes[*level] = NULL;
+ *level = i + 1;
+ }
+ }
+ return 1;
+}
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int cache_only)
+{
+ struct btrfs_path *path = NULL;
+ struct buffer_head *tmp;
+ int ret = 0;
+ int wret;
+ int level;
+ int orig_level;
+ int i;
+ int num_runs = 0;
+
+ if (root->ref_cows == 0) {
+ goto out;
+ }
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ level = btrfs_header_level(btrfs_buffer_header(root->node));
+ orig_level = level;
+ if (level == 0) {
+ goto out;
+ }
+ if (root->defrag_progress.objectid == 0) {
+ get_bh(root->node);
+ ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+ BUG_ON(ret);
+ ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+ BUG_ON(ret);
+ path->nodes[level] = root->node;
+ path->slots[level] = 0;
+ } else {
+ level = root->defrag_level;
+ path->lowest_level = level;
+ wret = btrfs_search_slot(trans, root, &root->defrag_progress,
+ path, 0, 1);
+
+ if (wret < 0) {
+ ret = wret;
+ goto out;
+ }
+ while(level > 0 && !path->nodes[level])
+ level--;
+ if (!path->nodes[level]) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ while(1) {
+ wret = defrag_walk_down(trans, root, path, &level, cache_only);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+
+ wret = defrag_walk_up(trans, root, path, &level, cache_only);
+ if (wret > 0)
+ break;
+ if (wret < 0)
+ ret = wret;
+ if (num_runs++ > 8) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ for (i = 0; i <= orig_level; i++) {
+ if (path->nodes[i]) {
+ btrfs_block_release(root, path->nodes[i]);
+ path->nodes[i] = 0;
+ }
+ }
+out:
+ if (path)
+ btrfs_free_path(path);
+ if (ret != -EAGAIN) {
+ memset(&root->defrag_progress, 0,
+ sizeof(root->defrag_progress));
+ }
+ return ret;
+}