From 2c90e5d658424bc71b111eb5a972240d5d06fe86 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 2 Apr 2007 10:50:19 -0400
Subject: Btrfs: still corruption hunting

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 fs/btrfs/btrfs_inode.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..e159841650a4
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,14 @@
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+struct btrfs_inode {
+	u32 magic;
+	struct inode vfs_inode;
+	u32 magic2;
+};
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{
+	return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+#endif
-- 
cgit v1.2.3


From d6e4a428eb8f92bbb3537ccabadfb1195efb432b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Apr 2007 15:37:36 -0400
Subject: Btrfs: start of support for many FS volumes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   7 ++
 fs/btrfs/btrfs_inode.h |   2 +
 fs/btrfs/ctree.h       |  36 ++++----
 fs/btrfs/dir-item.c    |  14 ++--
 fs/btrfs/disk-io.c     |   7 +-
 fs/btrfs/inode-item.c  |  26 ++++--
 fs/btrfs/print-tree.c  |   2 +-
 fs/btrfs/super.c       | 218 +++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/transaction.c |   9 ++
 9 files changed, 242 insertions(+), 79 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index ea4c38c790b4..73d4b842a316 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -19,3 +19,10 @@
 * fsck
 * Scrub & defrag
 
+---
+metata FS ideas:
+mount metadata FS on /sys/fs/btrfs/dev/
+subvolumes in /sys/fs/btrfs/dev/fs/
+snapshots in /sys/fs/btrfs/dev/fs/snapshots
+mount -o bind
+ioctl to create a snapshot
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e159841650a4..3439ffb467b5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -3,6 +3,8 @@
 
 struct btrfs_inode {
 	u32 magic;
+	struct btrfs_root *root;
+	struct btrfs_key location;
 	struct inode vfs_inode;
 	u32 magic2;
 };
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6d40758b893f..1ff5b99af682 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/kobject.h>
 #include "bit-radix.h"
 
 struct btrfs_trans_handle;
@@ -183,13 +184,15 @@ struct btrfs_inline_data_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_dir_item {
-	__le64 objectid;
+	struct btrfs_disk_key location;
 	__le16 flags;
 	__le16 name_len;
 	u8 type;
 } __attribute__ ((__packed__));
 
 struct btrfs_root_item {
+	struct btrfs_inode_item inode;
+	__le64 root_dirid;
 	__le64 blocknr;
 	__le32 flags;
 	__le64 block_limit;
@@ -249,6 +252,7 @@ struct btrfs_fs_info {
 	struct mutex fs_mutex;
 	struct crypto_hash *hash_tfm;
 	spinlock_t hash_lock;
+	struct kobject kobj;
 };
 
 /*
@@ -504,16 +508,6 @@ static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val)
 	item->size = cpu_to_le16(val);
 }
 
-static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d)
-{
-	return le64_to_cpu(d->objectid);
-}
-
-static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val)
-{
-	d->objectid = cpu_to_le64(val);
-}
-
 static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
 {
 	return le16_to_cpu(d->flags);
@@ -724,6 +718,16 @@ static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val)
 	item->blocknr = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_root_dirid(struct btrfs_root_item *item)
+{
+	return le64_to_cpu(item->root_dirid);
+}
+
+static inline void btrfs_set_root_dirid(struct btrfs_root_item *item, u64 val)
+{
+	item->root_dirid = cpu_to_le64(val);
+}
+
 static inline u32 btrfs_root_refs(struct btrfs_root_item *item)
 {
 	return le32_to_cpu(item->refs);
@@ -950,8 +954,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir, u64
-			  objectid, u8 type);
+			  *root, const char *name, int name_len, u64 dir,
+			  struct btrfs_key *location, u8 type);
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, u64 dir,
 			  const char *name, int name_len, int mod);
@@ -978,7 +982,8 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, u64 objectid, struct btrfs_inode_item
 		       *inode_item);
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct btrfs_path *path, u64 objectid, int mod);
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod);
 
 /* file-item.c */
 int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
@@ -997,4 +1002,7 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
 				 char *data, size_t len);
+/* super.c */
+extern struct subsystem btrfs_subsys;
+
 #endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0ee9945fb1b0..7aed9f015b55 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -25,8 +25,8 @@ int insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir, u64
-			  objectid, u8 type)
+			  *root, const char *name, int name_len, u64 dir,
+			  struct btrfs_key *location, u8 type)
 {
 	int ret = 0;
 	struct btrfs_path *path;
@@ -50,17 +50,21 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_dir_item);
-	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_cpu_key_to_disk(&dir_item->location, location);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
+	/* FIXME, use some real flag for selecting the extra index */
+	if (root == root->fs_info->tree_root)
+		goto out;
+
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = objectid;
+	key.offset = location->objectid;
 	ret = insert_with_overflow(trans, root, path, &key, data_size);
 	// FIXME clear the dirindex bit
 	if (ret)
@@ -69,7 +73,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_dir_item);
-	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_cpu_key_to_disk(&dir_item->location, location);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5230554380d1..b9301a5e4608 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 					     sb->s_blocksize);
 
 	if (!fs_info->sb_buffer) {
-printk("failed2\n");
 		return NULL;
 	}
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
@@ -369,6 +368,10 @@ printk("failed2\n");
 	ret = btrfs_find_highest_inode(root, &root->fs_info->last_inode_alloc);
 	if (ret == 0)
 		fs_info->highest_inode = fs_info->last_inode_alloc;
+	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
+	kobj_set_kset_s(fs_info, btrfs_subsys);
+	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
+	kobject_register(&fs_info->kobj);
 	mutex_unlock(&fs_info->fs_mutex);
 	return root;
 }
@@ -430,7 +433,7 @@ int close_ctree(struct btrfs_root *root)
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
 	kfree(root->fs_info->tree_root);
-	kfree(root->fs_info);
+	kobject_unregister(&root->fs_info->kobj);
 	kfree(root);
 	return 0;
 }
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6bfa980790c2..b276a3b40a66 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -26,15 +26,27 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct btrfs_path *path, u64 objectid, int mod)
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod)
 {
-	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
+	int ret;
+	int slot;
+	struct btrfs_leaf *leaf;
+	struct btrfs_key found_key;
 
-	key.objectid = objectid;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	key.offset = 0;
-	return btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+	if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+	    location->offset == (u64)-1 && path->slots[0] != 0) {
+		slot = path->slots[0] - 1;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key, &leaf->items[slot].key);
+		if (found_key.objectid == location->objectid &&
+		    btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+			path->slots[0]--;
+			return 0;
+		}
+	}
+	return ret;
 }
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index c8ee938c1251..f0da65c4f96b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -38,7 +38,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			printk("\t\tdir oid %Lu flags %u type %u\n",
-				btrfs_dir_objectid(di),
+				btrfs_disk_key_objectid(&di->location),
 				btrfs_dir_flags(di),
 				btrfs_dir_type(di));
 			printk("\t\tname %.*s\n",
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 32224ffdcecd..66d9fb2288c3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -16,9 +16,23 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 
+void btrfs_fsinfo_release(struct kobject *obj)
+{
+	struct btrfs_fs_info *fsinfo = container_of(obj,
+					    struct btrfs_fs_info, kobj);
+	kfree(fsinfo);
+}
+
+struct kobj_type btrfs_fsinfo_ktype = {
+	.release = btrfs_fsinfo_release,
+};
+
+decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
+
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
 static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_dir_ro_inode_operations;
 static struct super_operations btrfs_super_ops;
 static struct file_operations btrfs_dir_file_operations;
 static struct inode_operations btrfs_file_inode_operations;
@@ -37,7 +51,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key location;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -46,13 +61,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	mutex_lock(&root->fs_info->fs_mutex);
 
 	check_inode(inode);
-	ret = btrfs_lookup_inode(NULL, root, path, inode->i_ino, 0);
+
+	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
-		btrfs_release_path(root, path);
 		btrfs_free_path(path);
-		mutex_unlock(&root->fs_info->fs_mutex);
-		make_bad_inode(inode);
-		return;
+		goto make_bad;
 	}
 	check_inode(inode);
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
@@ -73,7 +87,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
 
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	inode_item = NULL;
 
@@ -92,8 +105,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
 	case S_IFDIR:
-		inode->i_op = &btrfs_dir_inode_operations;
 		inode->i_fop = &btrfs_dir_file_operations;
+		if (root == root->fs_info->tree_root)
+			inode->i_op = &btrfs_dir_ro_inode_operations;
+		else
+			inode->i_op = &btrfs_dir_inode_operations;
 		break;
 	case S_IFLNK:
 		// inode->i_op = &page_symlink_inode_operations;
@@ -101,6 +117,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	}
 	check_inode(inode);
 	return;
+
+make_bad:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	make_bad_inode(inode);
 }
 
 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
@@ -128,7 +150,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	}
 	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
-	objectid = btrfs_dir_objectid(di);
+	objectid = btrfs_disk_key_objectid(&di->location);
 
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
@@ -157,7 +179,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	root = btrfs_sb(dir->i_sb);
+	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
@@ -171,7 +193,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int err;
 	int ret;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_trans_handle *trans;
@@ -268,7 +290,8 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
-	ret = btrfs_lookup_inode(trans, root, path, objectid, -1);
+	ret = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
@@ -355,7 +378,7 @@ error:
 static void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -378,13 +401,13 @@ no_delete:
 }
 
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
-			      ino_t *ino)
+			       struct btrfs_key *location)
 {
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -393,13 +416,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	ret = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (ret || !btrfs_match_dir_item_name(root, path, name, namelen)) {
-		*ino = 0;
+		location->objectid = 0;
 		ret = 0;
 		goto out;
 	}
 	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
-	*ino = btrfs_dir_objectid(di);
+	btrfs_disk_key_to_cpu(location, &di->location);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -407,26 +430,76 @@ out:
 	return ret;
 }
 
+int fixup_tree_root_location(struct btrfs_root *root,
+			     struct btrfs_key *location,
+			     struct btrfs_root **sub_root)
+{
+	struct btrfs_path *path;
+	struct btrfs_root_item *ri;
+	int ret;
+
+	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+		return 0;
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	ret = btrfs_lookup_inode(NULL, root, path, location, 0);
+	if (ret)
+		goto out;
+	ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			  path->slots[0],
+			  struct btrfs_root_item);
+	location->objectid = btrfs_root_dirid(ri);
+	location->flags = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+	location->offset = 0;
+	/* FIXME properly select the root */
+	*sub_root = root->fs_info->fs_root;
+out:
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
 	struct inode * inode;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
-	ino_t ino;
+	struct btrfs_inode *bi = BTRFS_I(dir);
+	struct btrfs_root *root = bi->root;
+	struct btrfs_root *sub_root = root;
+	struct btrfs_key location;
 	int ret;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_inode_by_name(dir, dentry, &ino);
+	ret = btrfs_inode_by_name(dir, dentry, &location);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret < 0)
 		return ERR_PTR(ret);
 	inode = NULL;
-	if (ino) {
-		inode = iget(dir->i_sb, ino);
+	if (location.objectid) {
+		ret = fixup_tree_root_location(root, &location, &sub_root);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		if (ret > 0)
+			return ERR_PTR(-ENOENT);
+		inode = iget_locked(dir->i_sb, location.objectid);
 		if (!inode)
 			return ERR_PTR(-EACCES);
+		if (inode->i_state & I_NEW) {
+			BTRFS_I(inode)->root = sub_root;
+			memcpy(&BTRFS_I(inode)->location, &location,
+			       sizeof(location));
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+		}
 		check_inode(inode);
 	}
 	check_inode(dir);
@@ -436,7 +509,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
@@ -448,11 +521,16 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int advance;
 	unsigned char d_type = DT_UNKNOWN;
 	int over = 0;
+	int key_type = BTRFS_DIR_INDEX_KEY;
+
+	/* FIXME, use a real flag for deciding about the key type */
+	if (root->fs_info->tree_root == root)
+		key_type = BTRFS_DIR_ITEM_KEY;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
@@ -482,10 +560,11 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		item = leaf->items + slot;
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
-		if (btrfs_disk_key_offset(&item->key) >
+		if (key_type == BTRFS_DIR_INDEX_KEY &&
+		    btrfs_disk_key_offset(&item->key) >
 		    root->fs_info->highest_inode)
 			break;
-		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_INDEX_KEY)
+		if (btrfs_disk_key_type(&item->key) != key_type)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
@@ -495,7 +574,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
 			       btrfs_disk_key_offset(&item->key),
-			       btrfs_dir_objectid(di), d_type);
+			       btrfs_disk_key_objectid(&di->location), d_type);
 		if (over)
 			goto nopos;
 	}
@@ -527,6 +606,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	struct dentry * root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *root;
+	struct btrfs_inode *bi;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
@@ -546,6 +626,13 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	       btrfs_super_root_dir(disk_super));
 
 	inode = iget_locked(sb, btrfs_super_root_dir(disk_super));
+	bi = BTRFS_I(inode);
+	bi->location.objectid = inode->i_ino;
+	bi->location.offset = 0;
+	bi->location.flags = 0;
+	bi->root = root->fs_info->tree_root;
+	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
+
 	if (!inode)
 		return -ENOMEM;
 	if (inode->i_state & I_NEW) {
@@ -594,7 +681,8 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	btrfs_init_path(path);
 
-	ret = btrfs_lookup_inode(trans, root, path, inode->i_ino, 1);
+	ret = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
@@ -616,7 +704,7 @@ failed:
 
 static int btrfs_write_inode(struct inode *inode, int wait)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
@@ -637,8 +725,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 {
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
-	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_key *key;
 	int ret;
 	u64 objectid;
 
@@ -646,6 +734,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	BTRFS_I(inode)->root = BTRFS_I(dir)->root;
+	key = &BTRFS_I(inode)->location;
 	check_inode(inode);
 	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	BUG_ON(ret);
@@ -658,11 +748,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	fill_inode_item(&inode_item, inode);
 
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
+	key->objectid = objectid;
+	key->flags = 0;
+	key->offset = 0;
+	btrfs_set_key_type(key, BTRFS_INODE_ITEM_KEY);
+	ret = btrfs_insert_inode_map(trans, root, objectid, key);
 	BUG_ON(ret);
 
 	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
@@ -678,13 +768,20 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode)
 {
 	int ret;
-	ret = btrfs_insert_dir_item(trans, btrfs_sb(inode->i_sb),
+	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	ret = btrfs_insert_dir_item(trans, root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
-				    inode->i_ino, 0);
+				    &key, 0);
 	if (ret == 0) {
 		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
-		ret = btrfs_update_inode(trans, btrfs_sb(inode->i_sb),
+		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
 	}
 	check_inode(inode);
@@ -710,7 +807,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 			int mode, struct nameidata *nd)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode;
 	int err;
 	int drop_inode = 0;
@@ -747,18 +844,26 @@ out_unlock:
 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 				struct inode *inode, struct inode *dir)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 	char buf[2];
+	struct btrfs_key key;
+
 	buf[0] = '.';
 	buf[1] = '.';
 
+	key.objectid = inode->i_ino;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+
 	ret = btrfs_insert_dir_item(trans, root, buf, 1, inode->i_ino,
-				    inode->i_ino, 1);
+				    &key, 1);
 	if (ret)
 		goto error;
+	key.objectid = dir->i_ino;
 	ret = btrfs_insert_dir_item(trans, root, buf, 2, inode->i_ino,
-				    dir->i_ino, 1);
+				    &key, 1);
 	if (ret)
 		goto error;
 	inode->i_size = 6;
@@ -771,7 +876,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	struct inode *inode;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
 	int drop_on_err = 0;
 
@@ -910,7 +1015,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	struct btrfs_path *path;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_leaf *leaf;
@@ -999,7 +1104,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
 	int err;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_get_block_lock(inode, iblock, result, create);
 	// err = btrfs_get_block_inline(inode, iblock, result, create);
@@ -1041,7 +1146,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static void btrfs_truncate(struct inode *inode)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	struct btrfs_trans_handle *trans;
 
@@ -1208,7 +1313,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	int err = 0;
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *pages[1];
 
 	if (file->f_flags & O_DIRECT)
@@ -1365,7 +1470,7 @@ static ssize_t btrfs_file_inline_write(struct file *file,
 	int err = 0;
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	unsigned long page_index;
 
 	if (file->f_flags & O_DIRECT)
@@ -1432,13 +1537,14 @@ static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 {
 	char *kaddr;
 	unsigned long left, count = desc->count;
+	struct inode *inode = page->mapping->host;
 
 	if (size > count)
 		size = count;
 
 	if (!PageChecked(page)) {
 		/* FIXME, do it per block */
-		struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+		struct btrfs_root *root = BTRFS_I(inode)->root;
 		int ret = btrfs_csum_verify_file_block(root,
 					  page->mapping->host->i_ino,
 					  page->index << PAGE_CACHE_SHIFT,
@@ -1662,6 +1768,10 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.rmdir		= btrfs_rmdir,
 };
 
+static struct inode_operations btrfs_dir_ro_inode_operations = {
+	.lookup		= btrfs_lookup,
+};
+
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -1697,13 +1807,21 @@ static int __init init_btrfs_fs(void)
 	err = init_inodecache();
 	if (err)
 		return err;
+	kset_set_kset_s(&btrfs_subsys, fs_subsys);
+	err = subsystem_register(&btrfs_subsys);
+	if (err)
+		goto out;
 	return register_filesystem(&btrfs_fs_type);
+out:
+	destroy_inodecache();
+	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
 {
 	destroy_inodecache();
 	unregister_filesystem(&btrfs_fs_type);
+	subsystem_unregister(&btrfs_subsys);
 	printk("btrfs unloaded\n");
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b0f98c92123..20d84bd03223 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -52,6 +52,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
 
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
 	mutex_lock(&root->fs_info->trans_mutex);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -69,6 +71,10 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
+
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
+
 	WARN_ON(trans->magic != TRANS_MAGIC);
 	WARN_ON(trans->magic2 != TRANS_MAGIC);
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -148,6 +154,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	DEFINE_WAIT(wait);
 
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 printk("already in commit!, waiting\n");
-- 
cgit v1.2.3


From 1b05da2ee6217e7d55460d04335813fec25be4ca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 12:13:09 -0400
Subject: Btrfs: drop the inode map tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  2 --
 fs/btrfs/ctree.h       | 39 +++-------------------
 fs/btrfs/disk-io.c     | 25 +++++---------
 fs/btrfs/inode-item.c  |  2 ++
 fs/btrfs/inode-map.c   | 65 +++---------------------------------
 fs/btrfs/print-tree.c  |  8 -----
 fs/btrfs/super.c       | 89 ++++++++------------------------------------------
 fs/btrfs/transaction.c |  7 ----
 8 files changed, 33 insertions(+), 204 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3439ffb467b5..43a4f1ddb375 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -2,11 +2,9 @@
 #define __BTRFS_I__
 
 struct btrfs_inode {
-	u32 magic;
 	struct btrfs_root *root;
 	struct btrfs_key location;
 	struct inode vfs_inode;
-	u32 magic2;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cbcaaeac9e5..5103709bb2b9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -14,9 +14,8 @@ extern struct kmem_cache *btrfs_path_cachep;
 
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-#define BTRFS_INODE_MAP_OBJECTID 3ULL
-#define BTRFS_FS_TREE_OBJECTID 4ULL
-#define BTRFS_FIRST_FREE_OBJECTID 5ULL
+#define BTRFS_FS_TREE_OBJECTID 3ULL
+#define BTRFS_FIRST_FREE_OBJECTID 4ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
@@ -62,7 +61,6 @@ struct btrfs_header {
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
 	__le64 parentid; /* objectid of the tree root */
-	__le32 ham;
 	__le16 nritems;
 	__le16 flags;
 	u8 level;
@@ -226,23 +224,16 @@ struct btrfs_csum_item {
 	u8 csum[BTRFS_CSUM_SIZE];
 } __attribute__ ((__packed__));
 
-struct btrfs_inode_map_item {
-	u32 refs;
-} __attribute__ ((__packed__));
-
 struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
-	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
-	u64 last_inode_alloc;
 	u64 generation;
-	u64 highest_inode;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct buffer_head *sb_buffer;
@@ -272,6 +263,8 @@ struct btrfs_root {
 	u32 blocksize;
 	int ref_cows;
 	u32 type;
+	u64 highest_inode;
+	u64 last_inode_alloc;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -320,16 +313,11 @@ struct btrfs_root {
  */
 #define BTRFS_EXTENT_ITEM_KEY	8
 
-/*
- * the inode map records which inode numbers are in use and where
- * they actually live on disk
- */
-#define BTRFS_INODE_MAP_ITEM_KEY 9
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	10
+#define BTRFS_STRING_ITEM_KEY	9
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -883,17 +871,6 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
-static inline u32 btrfs_inode_map_refs(struct btrfs_inode_map_item *m)
-{
-	return le32_to_cpu(m->refs);
-}
-
-static inline void btrfs_set_inode_map_refs(struct btrfs_inode_map_item *m,
-					    u32 val)
-{
-	m->refs = cpu_to_le32(val);
-}
-
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -996,12 +973,6 @@ int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid);
-int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   u64 objectid, struct btrfs_key *location);
-int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct btrfs_path *path,
-			   u64 objectid, int mod);
 int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 
 /* inode-item.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b097ede80b1..760fdc9a7664 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -264,6 +264,8 @@ static int __setup_root(int blocksize,
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
+	root->highest_inode = 0;
+	root->last_inode_alloc = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	return 0;
@@ -295,6 +297,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_path *path;
 	struct btrfs_leaf *l;
+	u64 highest_inode;
 	int ret = 0;
 
 printk("read_fs_root looking for %Lu %Lu %u\n", location->objectid, location->offset, location->flags);
@@ -354,6 +357,12 @@ printk("radix_tree_insert gives us %d\n", ret);
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	ret = btrfs_find_highest_inode(root, &highest_inode);
+	if (ret == 0) {
+		root->highest_inode = highest_inode;
+		root->last_inode_alloc = highest_inode;
+printk("highest inode is %Lu\n", highest_inode);
+	}
 printk("all worked\n");
 	return root;
 }
@@ -364,8 +373,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
-	struct btrfs_root *inode_root = kmalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	int ret;
@@ -378,9 +385,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
-	fs_info->inode_root = inode_root;
-	fs_info->last_inode_alloc = 0;
-	fs_info->highest_inode = 0;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -425,14 +429,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
-				  BTRFS_INODE_MAP_OBJECTID, inode_root);
-	BUG_ON(ret);
-
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
-	ret = btrfs_find_highest_inode(tree_root, &fs_info->last_inode_alloc);
-	if (ret == 0)
-		fs_info->highest_inode = fs_info->last_inode_alloc;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
 	kobj_set_kset_s(fs_info, btrfs_subsys);
 	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
@@ -512,9 +509,6 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->extent_root->node)
 		btrfs_block_release(fs_info->extent_root,
 				    fs_info->extent_root->node);
-	if (fs_info->inode_root->node)
-		btrfs_block_release(fs_info->inode_root,
-				    fs_info->inode_root->node);
 	if (fs_info->tree_root->node)
 		btrfs_block_release(fs_info->tree_root,
 				    fs_info->tree_root->node);
@@ -524,7 +518,6 @@ int close_ctree(struct btrfs_root *root)
 	iput(fs_info->btree_inode);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
-	kfree(fs_info->inode_root);
 	kfree(fs_info->tree_root);
 	kobject_unregister(&fs_info->kobj);
 	return 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b276a3b40a66..c3b990b661c2 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -22,6 +22,8 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 				sizeof(*inode_item));
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	if (ret == 0 && objectid > root->highest_inode)
+		root->highest_inode = objectid;
 	return ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f665221409ac..318e27a6378b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -3,12 +3,11 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid)
+int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_leaf *l;
-	struct btrfs_root *root = fs_root->fs_info->inode_root;
 	struct btrfs_key search_key;
 	int slot;
 
@@ -38,7 +37,7 @@ error:
  * walks the btree of allocated inodes and find a hole.
  */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *fs_root,
+			     struct btrfs_root *root,
 			     u64 dirid, u64 *objectid)
 {
 	struct btrfs_path *path;
@@ -49,16 +48,13 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	u64 last_ino = 0;
 	int start_found;
 	struct btrfs_leaf *l;
-	struct btrfs_root *root = fs_root->fs_info->inode_root;
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	search_key.flags = 0;
-	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
-
-	search_start = fs_root->fs_info->last_inode_alloc;
+	search_start = root->last_inode_alloc;
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.offset = 0;
@@ -108,7 +104,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	}
 	// FIXME -ENOSPC
 found:
-	root->fs_info->last_inode_alloc = *objectid;
+	root->last_inode_alloc = *objectid;
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
@@ -118,56 +114,3 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
-
-int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *fs_root,
-			   u64 objectid, struct btrfs_key *location)
-{
-	int ret = 0;
-	struct btrfs_path *path;
-	struct btrfs_inode_map_item *inode_item;
-	struct btrfs_key key;
-	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
-
-	key.objectid = objectid;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
-	key.offset = 0;
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	ret = btrfs_insert_empty_item(trans, inode_root, path, &key,
-				      sizeof(struct btrfs_inode_map_item));
-	if (ret)
-		goto out;
-
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				    path->slots[0], struct btrfs_inode_map_item);
-	btrfs_cpu_key_to_disk(&inode_item->key, location);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	if (objectid > fs_root->fs_info->highest_inode)
-		fs_root->fs_info->highest_inode = objectid;
-out:
-	btrfs_release_path(inode_root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *fs_root, struct btrfs_path *path,
-			   u64 objectid, int mod)
-{
-	int ret;
-	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
-
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
-	ret = btrfs_search_slot(trans, inode_root, &key, path, ins_len, cow);
-	return ret;
-}
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f0da65c4f96b..854d47d9bdc9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -10,7 +10,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
-	struct btrfs_inode_map_item *mi;
 	struct btrfs_inode_item *ii;
 	u32 type;
 
@@ -54,13 +53,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			printk("\t\textent data refs %u owner %Lu\n",
 				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
 			break;
-		case BTRFS_INODE_MAP_ITEM_KEY:
-			mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item);
-			printk("\t\tinode map key %Lu %u %Lu\n",
-			       btrfs_disk_key_objectid(&mi->key),
-			       btrfs_disk_key_flags(&mi->key),
-			       btrfs_disk_key_offset(&mi->key));
-			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bbe5cabcb42e..3e8bfb0e5d7e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -45,14 +45,6 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct file_operations btrfs_file_operations;
 
-static int check_inode(struct inode *inode)
-{
-	struct btrfs_inode *ei = BTRFS_I(inode);
-	WARN_ON(ei->magic != 0xDEADBEEF);
-	WARN_ON(ei->magic2 != 0xDEADBEAF);
-	return 0;
-}
-
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -66,15 +58,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
-	check_inode(inode);
-
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		goto make_bad;
 	}
-	check_inode(inode);
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_inode_item);
@@ -97,7 +86,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode_item = NULL;
 
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
+
 	switch (inode->i_mode & S_IFMT) {
 #if 0
 	default:
@@ -121,7 +110,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		// inode->i_op = &page_symlink_inode_operations;
 		break;
 	}
-	check_inode(inode);
 	return;
 
 make_bad:
@@ -272,10 +260,7 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct inode *inode)
 {
-	u64 objectid = inode->i_ino;
 	struct btrfs_path *path;
-	struct btrfs_inode_map_item *map;
-	struct btrfs_key stat_data_key;
 	int ret;
 
 	clear_inode(inode);
@@ -283,26 +268,11 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	ret = btrfs_lookup_inode_map(trans, root, path, objectid, -1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto error;
-	}
-	map = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			    struct btrfs_inode_map_item);
-	btrfs_disk_key_to_cpu(&stat_data_key, &map->key);
-	ret = btrfs_del_item(trans, root->fs_info->inode_root, path);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
-error:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -432,7 +402,6 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	check_inode(dir);
 	return ret;
 }
 
@@ -540,9 +509,7 @@ printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_r
 			btrfs_read_locked_inode(inode);
 			unlock_new_inode(inode);
 		}
-		check_inode(inode);
 	}
-	check_inode(dir);
 	return d_splice_alias(inode, dentry);
 }
 
@@ -566,7 +533,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
@@ -575,9 +541,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
+	if (ret < 0)
 		goto err;
-	}
 	advance = 0;
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
@@ -601,8 +566,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
-		    btrfs_disk_key_offset(&item->key) >
-		    root->fs_info->highest_inode)
+		    btrfs_disk_key_offset(&item->key) > root->highest_inode)
 			break;
 		if (btrfs_disk_key_type(&item->key) != key_type)
 			continue;
@@ -707,7 +671,6 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
-	check_inode(inode);
 }
 
 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
@@ -721,7 +684,6 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -736,11 +698,11 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	fill_inode_item(inode_item, inode);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	ret = 0;
 failed:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	check_inode(inode);
-	return 0;
+	return ret;
 }
 
 static int btrfs_write_inode(struct inode *inode, int wait)
@@ -757,7 +719,6 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 	else
 		btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
 	return ret;
 }
 
@@ -767,7 +728,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_key *key;
+	struct btrfs_key *location;
 	int ret;
 	u64 objectid;
 
@@ -776,8 +737,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = BTRFS_I(dir)->root;
-	key = &BTRFS_I(inode)->location;
-	check_inode(inode);
 	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	BUG_ON(ret);
 
@@ -788,20 +747,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	fill_inode_item(&inode_item, inode);
-
-	key->objectid = objectid;
-	key->flags = 0;
-	key->offset = 0;
-	btrfs_set_key_type(key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, key);
-	BUG_ON(ret);
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->flags = 0;
+	location->offset = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
 	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
 	BUG_ON(ret);
 
 	insert_inode_hash(inode);
-	check_inode(inode);
-	check_inode(dir);
 	return inode;
 }
 
@@ -825,8 +780,6 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
 	}
-	check_inode(inode);
-	check_inode(dentry->d_parent->d_inode);
 	return ret;
 }
 
@@ -840,7 +793,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 	}
 	if (err > 0)
 		err = -EEXIST;
-	check_inode(inode);
 	return err;
 }
 
@@ -872,8 +824,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
-	check_inode(dir);
 
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -1701,20 +1651,14 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_update_inode(trans, root, root->inode);
 	BUG_ON(ret);
 
-	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
 	BUG_ON(ret);
 
 	memset(&new_root_item, 0, sizeof(new_root_item));
 	memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
-	BUG_ON(ret);
-
 	key.objectid = objectid;
 	key.offset = 1;
 	key.flags = 0;
@@ -1791,21 +1735,14 @@ static struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-	ei->magic = 0xDEADBEEF;
-	ei->magic2 = 0xDEADBEAF;
 	return &ei->vfs_inode;
 }
 
 static void btrfs_destroy_inode(struct inode *inode)
 {
-	struct btrfs_inode *ei = BTRFS_I(inode);
-	WARN_ON(ei->magic != 0xDEADBEEF);
-	WARN_ON(ei->magic2 != 0xDEADBEAF);
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
-	ei->magic = 0;
-	ei->magic2 = 0;
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 83a0194ab163..54978d1bd958 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -115,14 +115,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_root *inode_root = fs_info->inode_root;
 
-	btrfs_set_root_blocknr(&inode_root->root_item,
-			       inode_root->node->b_blocknr);
-	ret = btrfs_update_root(trans, tree_root,
-				&inode_root->root_key,
-				&inode_root->root_item);
-	BUG_ON(ret);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == extent_root->node->b_blocknr)
-- 
cgit v1.2.3


From 31f3c99b73483f7b738a886c552050cbd6128ff3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 30 Apr 2007 15:25:45 -0400
Subject: Btrfs: allocator improvements, inode block groups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   1 +
 fs/btrfs/ctree.c       |  10 ++--
 fs/btrfs/ctree.h       |  18 ++++++-
 fs/btrfs/extent-tree.c | 138 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/super.c       |  53 +++++++++++++++----
 fs/btrfs/transaction.c |   1 +
 fs/btrfs/transaction.h |  15 ++++++
 7 files changed, 181 insertions(+), 55 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43a4f1ddb375..6b50076b02d5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -3,6 +3,7 @@
 
 struct btrfs_inode {
 	struct btrfs_root *root;
+	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct inode vfs_inode;
 };
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b4783bf8bf4f..dbd3f636dd3d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -61,7 +61,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		*cow_ret = buf;
 		return 0;
 	}
-	cow = btrfs_alloc_free_block(trans, root);
+	cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
 	cow_node = btrfs_buffer_node(cow);
 	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
 		WARN_ON(1);
@@ -800,7 +800,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(trans, root);
+	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
 	c = btrfs_buffer_node(t);
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
@@ -905,7 +905,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(trans, root);
+	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
@@ -1277,7 +1277,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
-	right_buffer = btrfs_alloc_free_block(trans, root);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
@@ -1374,7 +1374,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (!double_split)
 		return ret;
-	right_buffer = btrfs_alloc_free_block(trans, root);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c432222d40e3..e6bf9919536a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,6 +174,7 @@ struct btrfs_inode_item {
 	__le64 generation;
 	__le64 size;
 	__le64 nblocks;
+	__le64 block_group;
 	__le32 nlink;
 	__le32 uid;
 	__le32 gid;
@@ -241,6 +242,7 @@ struct btrfs_device_item {
 
 /* tag for the radix tree of block groups in ram */
 #define BTRFS_BLOCK_GROUP_DIRTY 0
+#define BTRFS_BLOCK_GROUP_AVAIL 1
 #define BTRFS_BLOCK_GROUP_HINTS 8
 #define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
 struct btrfs_block_group_item {
@@ -410,6 +412,17 @@ static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val)
 	i->nblocks = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_inode_block_group(struct btrfs_inode_item *i)
+{
+	return le64_to_cpu(i->block_group);
+}
+
+static inline void btrfs_set_inode_block_group(struct btrfs_inode_item *i,
+						u64 val)
+{
+	i->block_group = cpu_to_le64(val);
+}
+
 static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i)
 {
 	return le32_to_cpu(i->nlink);
@@ -1054,10 +1067,13 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+						 struct btrfs_block_group_cache
+						 *hint, int data);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root);
+					    struct btrfs_root *root, u64 hint);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
 		       u64 num_blocks, u64 search_start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62051a36664a..8b8cbe25fffb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,42 +12,57 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
-static int find_search_start(struct btrfs_root *root, int data)
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+						 struct btrfs_block_group_cache
+						 *hint, int data)
 {
 	struct btrfs_block_group_cache *cache[8];
+	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
-	u64 last;
+	u64 last = 0;
+	u64 hint_last;
 	int i;
 	int ret;
-
-	cache[0] = info->block_group_cache;
-	if (!cache[0])
-		goto find_new;
-	used = btrfs_block_group_used(&cache[0]->item);
-	if (used < (cache[0]->key.offset * 3 / 2))
-		return 0;
-find_new:
-	last = 0;
+	int full_search = 0;
+	if (hint) {
+		used = btrfs_block_group_used(&hint->item);
+		if (used < (hint->key.offset * 2) / 3) {
+			return hint;
+		}
+		radix_tree_tag_clear(&info->block_group_radix,
+				     hint->key.objectid + hint->key.offset - 1,
+				     BTRFS_BLOCK_GROUP_AVAIL);
+		last = hint->key.objectid + hint->key.offset;
+		hint_last = last;
+	} else {
+		hint_last = 0;
+		last = 0;
+	}
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(&info->block_group_radix,
 						 (void **)cache,
 						 last, ARRAY_SIZE(cache),
-						 BTRFS_BLOCK_GROUP_DIRTY);
+						 BTRFS_BLOCK_GROUP_AVAIL);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < (cache[i]->key.offset * 3 / 2)) {
+			if (used < (cache[i]->key.offset * 2) / 3) {
 				info->block_group_cache = cache[i];
-				cache[i]->last_alloc = cache[i]->first_free;
-				return 0;
+				found_group = cache[i];
+				goto found;
 			}
+			radix_tree_tag_clear(&info->block_group_radix,
+					   cache[i]->key.objectid +
+					   cache[i]->key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
 			last = cache[i]->key.objectid +
-				cache[i]->key.offset - 1;
+				cache[i]->key.offset;
 		}
 	}
-	last = 0;
+	last = hint_last;
+again:
 	while(1) {
 		ret = radix_tree_gang_lookup(&info->block_group_radix,
 						 (void **)cache,
@@ -56,17 +71,32 @@ find_new:
 			break;
 		for (i = 0; i < ret; i++) {
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < (cache[i]->key.offset * 3 / 2)) {
+			if (used < cache[i]->key.offset) {
 				info->block_group_cache = cache[i];
-				cache[i]->last_alloc = cache[i]->first_free;
-				return 0;
+				found_group = cache[i];
+				goto found;
 			}
+			radix_tree_tag_clear(&info->block_group_radix,
+					   cache[i]->key.objectid +
+					   cache[i]->key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
 			last = cache[i]->key.objectid +
-				cache[i]->key.offset - 1;
+				cache[i]->key.offset;
 		}
 	}
 	info->block_group_cache = NULL;
-	return 0;
+	if (!full_search) {
+		last = 0;
+		full_search = 1;
+		goto again;
+	}
+found:
+	if (!found_group) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&found_group, 0, 1);
+		BUG_ON(ret != 1);
+	}
+	return found_group;
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -243,6 +273,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 						    path, cache[i]);
 			if (err)
 				werr = err;
+			cache[i]->last_alloc = cache[i]->first_free;
 		}
 	}
 	btrfs_free_path(path);
@@ -322,10 +353,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 						    btree_inode->i_blkbits));
 		}
 	}
-	if (root->fs_info->block_group_cache) {
-		root->fs_info->block_group_cache->last_alloc =
-			root->fs_info->block_group_cache->first_free;
-	}
 	return 0;
 }
 
@@ -532,22 +559,43 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int total_found = 0;
 	int fill_prealloc = 0;
 	int level;
+	int update_block_group = 0;
+	struct btrfs_block_group_cache *hint_block_group;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	/* find search start here */
+	if (0 && search_start && num_blocks) {
+		u64 used;
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&hint_block_group,
+					     search_start, 1);
+		if (ret) {
+			used = btrfs_block_group_used(&hint_block_group->item);
+			if (used > (hint_block_group->key.offset * 9) / 10)
+				search_start = 0;
+			else if (search_start < hint_block_group->last_alloc)
+				search_start = hint_block_group->last_alloc;
+		} else {
+			search_start = 0;
+		}
+	}
 	if (num_blocks == 0) {
 		fill_prealloc = 1;
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
-	find_search_start(root, 0);
-	if (info->block_group_cache &&
-	    info->block_group_cache->last_alloc > search_start)
-		search_start = info->block_group_cache->last_alloc;
-
+	if (1 || !search_start) {
+		trans->block_group = btrfs_find_block_group(root,
+							    trans->block_group,
+							    0);
+		if (trans->block_group->last_alloc > search_start)
+			search_start = trans->block_group->last_alloc;
+		update_block_group = 1;
+	}
 check_failed:
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -662,11 +710,13 @@ check_pending:
 		}
 		info->extent_tree_prealloc_nr = total_found;
 	}
-	ret = radix_tree_gang_lookup(&info->block_group_radix,
-				     (void **)&info->block_group_cache,
-				     ins->objectid, 1);
-	if (ret) {
-		info->block_group_cache->last_alloc = ins->objectid;
+	if (update_block_group) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&trans->block_group,
+					     ins->objectid, 1);
+		if (ret) {
+			trans->block_group->last_alloc = ins->objectid;
+		}
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -747,14 +797,14 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root)
+					   struct btrfs_root *root, u64 hint)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, 0, (unsigned long)-1, &ins);
+				 1, hint, (unsigned long)-1, &ins);
 	if (ret) {
 		BUG();
 		return NULL;
@@ -975,6 +1025,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key found_key;
 	struct btrfs_leaf *leaf;
 	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
+	u64 used;
 
 	root = root->fs_info->extent_root;
 	key.objectid = 0;
@@ -1005,8 +1056,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				    struct btrfs_block_group_item);
 		memcpy(&cache->item, bi, sizeof(*bi));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
-		cache->last_alloc = 0;
-		cache->first_free = 0;
+		cache->last_alloc = cache->key.objectid;
+		cache->first_free = cache->key.objectid;
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		ret = radix_tree_insert(&root->fs_info->block_group_radix,
@@ -1014,6 +1065,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					found_key.offset - 1,
 					(void *)cache);
 		BUG_ON(ret);
+		used = btrfs_block_group_used(bi);
+		if (used < (key.offset * 2) / 3) {
+			radix_tree_tag_set(&root->fs_info->block_group_radix,
+					   found_key.objectid +
+					   found_key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
+		}
 		if (key.objectid >=
 		    btrfs_super_total_blocks(root->fs_info->disk_super))
 			break;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1890e8648dbd..7ecbe7c86186 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,6 +52,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	struct btrfs_block_group_cache *alloc_group;
+	u64 alloc_group_block;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -82,6 +84,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
+	alloc_group_block = btrfs_inode_block_group(inode_item);
+	ret = radix_tree_gang_lookup(&root->fs_info->block_group_radix,
+				     (void **)&alloc_group,
+				     alloc_group_block, 1);
+	BUG_ON(!ret);
+	BTRFS_I(inode)->block_group = alloc_group;
 
 	btrfs_free_path(path);
 	inode_item = NULL;
@@ -136,6 +144,8 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
+	btrfs_set_inode_block_group(item,
+				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
 
@@ -237,6 +247,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -262,6 +273,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = (u32)-1;
@@ -429,6 +441,7 @@ static void btrfs_delete_inode(struct inode *inode)
 	inode->i_size = 0;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	if (S_ISREG(inode->i_mode)) {
 		ret = btrfs_truncate_in_trans(trans, root, inode);
 		BUG_ON(ret);
@@ -731,6 +744,7 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 	if (wait) {
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
 	}
@@ -744,6 +758,7 @@ static void btrfs_dirty_inode(struct inode *inode)
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -751,7 +766,9 @@ static void btrfs_dirty_inode(struct inode *inode)
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
-				     u64 objectid, int mode)
+				     u64 objectid,
+				     struct btrfs_block_group_cache *group,
+				     int mode)
 {
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
@@ -763,6 +780,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = root;
+	group = btrfs_find_block_group(root, group, 0);
+	BTRFS_I(inode)->block_group = group;
 
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
@@ -832,6 +851,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
@@ -839,11 +859,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid, mode);
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
-	// FIXME mark the inode dirty
+
+	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode);
 	if (err)
 		drop_inode = 1;
@@ -853,6 +875,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 	}
 	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -904,6 +928,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_unlock;
@@ -915,7 +940,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid, S_IFDIR | mode);
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -923,6 +949,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	drop_on_err = 1;
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
+	btrfs_set_trans_block_group(trans, inode);
 
 	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
 	if (err)
@@ -938,6 +965,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
 	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
 
 out_fail:
 	btrfs_end_transaction(trans, root);
@@ -1349,6 +1378,7 @@ static void btrfs_truncate(struct inode *inode)
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	BUG_ON(ret);
 	ret = btrfs_end_transaction(trans, root);
@@ -1445,6 +1475,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
 
 		bh = page_buffers(pages[i]);
 		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
@@ -1481,6 +1512,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			kunmap(pages[i]);
 		}
 		SetPageChecked(pages[i]);
+		btrfs_update_inode_block_group(trans, inode);
 		ret = btrfs_end_transaction(trans, root);
 		BUG_ON(ret);
 		mutex_unlock(&root->fs_info->fs_mutex);
@@ -1821,6 +1853,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		mutex_unlock(&root->fs_info->fs_mutex);
 		goto out_unlock;
 	}
+	btrfs_set_trans_block_group(trans, inode);
 	/* FIXME blocksize != 4096 */
 	inode->i_blocks += num_blocks << 3;
 	if (start_pos < inode->i_size) {
@@ -1845,6 +1878,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 	BUG_ON(ret);
 	alloc_extent_start = ins.objectid;
+	btrfs_update_inode_block_group(trans, inode);
 	ret = btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
@@ -2017,6 +2051,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_leaf *leaf;
 	struct btrfs_root *new_root;
 	struct inode *inode;
+	struct inode *dir;
 	int ret;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
@@ -2025,7 +2060,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	subvol = btrfs_alloc_free_block(trans, root);
+	subvol = btrfs_alloc_free_block(trans, root, 0);
 	if (subvol == NULL)
 		return -ENOSPC;
 	leaf = btrfs_buffer_leaf(subvol);
@@ -2069,10 +2104,9 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	 * insert the directory item
 	 */
 	key.offset = (u64)-1;
+	dir = root->fs_info->sb->s_root->d_inode;
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen,
-				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, 0);
+				    name, namelen, dir->i_ino, &key, 0);
 	BUG_ON(ret);
 
 	ret = btrfs_commit_transaction(trans, root);
@@ -2084,7 +2118,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	inode = btrfs_new_inode(trans, new_root, new_dirid, S_IFDIR | 0700);
+	inode = btrfs_new_inode(trans, new_root, new_dirid,
+				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f9b8864dcc40..bdbf514c111f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -77,6 +77,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
+	h->block_group = NULL;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	h->magic = h->magic2 = TRANS_MAGIC;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index afe42d167cee..b5378119e76c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,5 +1,6 @@
 #ifndef __TRANSACTION__
 #define __TRANSACTION__
+#include "btrfs_inode.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -20,10 +21,24 @@ struct btrfs_trans_handle {
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
+	struct btrfs_block_group_cache *block_group;
 	int magic2;
 };
 
 
+static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
+					       struct inode *inode)
+{
+	trans->block_group = BTRFS_I(inode)->block_group;
+}
+
+static inline void btrfs_update_inode_block_group(struct
+						  btrfs_trans_handle *trans,
+						  struct inode *inode)
+{
+	BTRFS_I(inode)->block_group = trans->block_group;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-- 
cgit v1.2.3


From 6cbd55707802b98843f953d1ae6d8f5bcd9a76c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 09:07:21 -0400
Subject: Btrfs: add GPLv2

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/COPYING       | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/TODO          |   1 -
 fs/btrfs/bit-radix.c   |  18 +++
 fs/btrfs/bit-radix.h   |  18 +++
 fs/btrfs/btrfs_inode.h |  18 +++
 fs/btrfs/ctree.c       |  18 +++
 fs/btrfs/ctree.h       |  18 +++
 fs/btrfs/dir-item.c    |  18 +++
 fs/btrfs/disk-io.c     |  18 +++
 fs/btrfs/disk-io.h     |  18 +++
 fs/btrfs/extent-tree.c |  18 +++
 fs/btrfs/file-item.c   |  18 +++
 fs/btrfs/file.c        |  18 +++
 fs/btrfs/hash.c        |  18 +++
 fs/btrfs/hash.h        |  18 +++
 fs/btrfs/inode-item.c  |  18 +++
 fs/btrfs/inode-map.c   |  18 +++
 fs/btrfs/inode.c       |  18 +++
 fs/btrfs/ioctl.h       |  18 +++
 fs/btrfs/print-tree.c  |  18 +++
 fs/btrfs/print-tree.h  |  18 +++
 fs/btrfs/root-tree.c   |  18 +++
 fs/btrfs/super.c       |  18 +++
 fs/btrfs/sysfs.c       |  18 +++
 fs/btrfs/transaction.c |  18 +++
 fs/btrfs/transaction.h |  18 +++
 26 files changed, 788 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/COPYING

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 000000000000..ca442d313d86
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
+
+   NOTE! This copyright does *not* cover user programs that use kernel
+ services by normal system calls - this is merely considered normal use
+ of the kernel, and does *not* fall under the heading of "derived work".
+ Also note that the GPL below is copyrighted by the Free Software
+ Foundation, but the instance of code that it refers to (the Linux
+ kernel) is copyrighted by me and others who actually wrote it.
+
+ Also note that the only valid version of the GPL as far as the kernel
+ is concerned is _this_ particular version of the license (ie v2, not
+ v2.2 or v3.x or whatever), unless explicitly otherwise stated.
+
+			Linus Torvalds
+
+----------------------------------------
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 4a48a2365d57..d9b6d38c603a 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,4 +1,3 @@
-* fix printk warnings
 * cleanup, add more error checking, get rid of BUG_ONs
 * Fix ENOSPC handling
 * Make allocator smarter
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 8f9cd4277231..a4ddf4de9797 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "bit-radix.h"
 
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
index 4e717e30db4f..c100f54d5c32 100644
--- a/fs/btrfs/bit-radix.h
+++ b/fs/btrfs/bit-radix.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __BIT_RADIX__
 #define __BIT_RADIX__
 #include <linux/radix-tree.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6b50076b02d5..783f06b63259 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 20e01f95ce6e..4736155af419 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 24410d42e3ce..4bd648d68e88 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __BTRFS__
 #define __BTRFS__
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index b408a3d20c7b..0d179a44ebf4 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 878101172546..8c68a64c0ceb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 822ccb8aa4ae..7b76ccc48754 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __DISKIO__
 #define __DISKIO__
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe02fbfa6d59..975a0eb9241b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d5a98827e38a..37dd3801a107 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0325dc038592..566e526b60ff 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 32de1ea1b64e..e5c76903d722 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 /*
  *  Original copy from:
  *  linux/fs/ext3/hash.c
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index e56c4670093d..d3be02670585 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __HASH__
 #define __HASH__
 int btrfs_name_hash(const char *name, int len, u64 *hash_result);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index c3b990b661c2..449e9dbdf957 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 318e27a6378b..58930fc48be6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1cabc1951b0e..46f869d6d0fb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 497fadd98c3a..23bed48c51a7 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __IOCTL_
 #define __IOCTL_
 #include <linux/ioctl.h>
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 21791f037561..1fa705683008 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 396041a05cfa..205cd03601ab 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 13c30284fd84..7a2d63c1ee80 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 94b3535a2a11..ef8ad18b275c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ed8b7e20cfbc..ead9dc8051be 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4f3c8ddcb401..30914521cf01 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/fs.h>
 #include "ctree.h"
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f25b4900db45..5fb1d322aa10 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __TRANSACTION__
 #define __TRANSACTION__
 #include "btrfs_inode.h"
-- 
cgit v1.2.3


From f1ace244c8c1e16eaa5c8b3b5339849651e31ede Mon Sep 17 00:00:00 2001
From: Aneesh <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Jun 2007 16:18:26 -0400
Subject: btrfs: Code cleanup Attaching below is some of the code cleanups that
 i came across while reading the code.

a) alloc_path already calls init_path.
b) Mention that btrfs_inode is the in memory copy.Ext4 have ext4_inode_info as
the in memory copy ext4_inode as the disk copy

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/dir-item.c    |  1 -
 fs/btrfs/extent-tree.c |  4 ----
 fs/btrfs/file-item.c   |  2 --
 fs/btrfs/inode-item.c  |  1 -
 fs/btrfs/inode.c       | 10 +---------
 fs/btrfs/root-tree.c   |  3 ---
 8 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 783f06b63259..6bce46be49ff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+/* in memory btrfs inode */
 struct btrfs_inode {
 	struct btrfs_root *root;
 	struct btrfs_block_group_cache *block_group;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4736155af419..498dea0854c1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1647,7 +1647,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (!ret) {
 		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0d179a44ebf4..ff10cf5a8422 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -74,7 +74,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 975a0eb9241b..5f4eaba8b85f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -402,7 +402,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -437,7 +436,6 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.offset = num_blocks;
 	key.flags = 0;
@@ -807,7 +805,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret) {
@@ -1463,7 +1460,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 
 	level = btrfs_header_level(btrfs_buffer_header(snap));
 	orig_level = level;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 37dd3801a107..3c140cc1dd05 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -38,7 +38,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = pos;
 	file_key.flags = 0;
@@ -274,7 +273,6 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 449e9dbdf957..e8b5a9f7c0c2 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -35,7 +35,6 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_insert_item(trans, root, &key, inode_item,
 				sizeof(*inode_item));
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d5a4cdb6af..e639cc9e089f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,7 +79,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
@@ -177,7 +176,6 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -214,7 +212,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -289,7 +286,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
@@ -360,7 +356,6 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
@@ -687,7 +682,6 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (!di || IS_ERR(di)) {
@@ -874,7 +868,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -1263,7 +1256,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	if (create & BTRFS_GET_BLOCK_CREATE) {
 		WARN_ON(1);
 		/* this almost but not quite works */
@@ -1280,7 +1272,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	}
 
 	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       inode->i_ino,
+				       objectid,
 				       iblock << inode->i_blkbits, 0);
 	if (ret < 0) {
 		err = ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7a2d63c1ee80..8e1b9046d5ec 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -36,7 +36,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -70,7 +69,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
 	if (ret < 0)
 		goto out;
@@ -106,7 +104,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-- 
cgit v1.2.3


From 15ee9bc7ed171248d1405df5854da5fa91bfdc39 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jwhiter@redhat.com>
Date: Fri, 10 Aug 2007 16:22:09 -0400
Subject: Btrfs: delay commits during fsync to allow more writers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  5 +++++
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/file.c        | 22 ++++++++++++++++++----
 fs/btrfs/inode.c       |  2 ++
 fs/btrfs/transaction.c | 44 ++++++++++++++++++++++++++++++++------------
 fs/btrfs/transaction.h |  7 +++++++
 7 files changed, 66 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6bce46be49ff..f446f161f736 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,6 +25,11 @@ struct btrfs_inode {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct inode vfs_inode;
+
+	/*
+	 * transid of the trans_handle that last modified this inode
+	 */
+	u64 last_trans;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d3cd564b3b3f..c7f5161271d3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -301,6 +301,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root extent_map_radix;
 	struct radix_tree_root extent_ins_radix;
 	u64 generation;
+	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_super_block super_copy;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 192211274ce5..b2f79878d51a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -435,6 +435,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
+	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 00b118a2db69..6933ab11a5cd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -694,22 +694,36 @@ static int btrfs_sync_file(struct file *file,
 {
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
+	int ret = 0;
 	struct btrfs_trans_handle *trans;
 
 	/*
-	 * FIXME, use inode generation number to check if we can skip the
-	 * commit
+	 * check the transaction that last modified this inode
+	 * and see if its already been committed
 	 */
 	mutex_lock(&root->fs_info->fs_mutex);
+	if (!BTRFS_I(inode)->last_trans)
+		goto out;
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (BTRFS_I(inode)->last_trans <=
+	    root->fs_info->last_trans_committed) {
+		BTRFS_I(inode)->last_trans = 0;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		goto out;
+	}
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	/*
+ 	 * ok we haven't committed the transaction yet, lets do a commit
+ 	 */
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	ret = btrfs_commit_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 out:
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret > 0 ? EIO : ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c05ecbc5726..398484179d82 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -193,6 +193,7 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	fill_inode_item(inode_item, inode);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
 	btrfs_release_path(root, path);
@@ -2234,6 +2235,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+	ei->last_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9d52dc83e48..18abea802794 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,7 +55,8 @@ static int join_transaction(struct btrfs_root *root)
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
-		cur_trans->num_writers = 0;
+		cur_trans->num_writers = 1;
+		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
@@ -65,8 +66,11 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		init_bit_radix(&cur_trans->dirty_pages);
+	} else {
+		cur_trans->num_writers++;
+		cur_trans->num_joined++;
 	}
-	cur_trans->num_writers++;
+
 	return 0;
 }
 
@@ -428,12 +432,14 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
-	int ret = 0;
+	unsigned long joined = 0;
+	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
 	struct radix_tree_root pinned_copy;
 	DEFINE_WAIT(wait);
+	int ret;
 
 	init_bit_radix(&pinned_copy);
 	INIT_LIST_HEAD(&dirty_fs_roots);
@@ -448,7 +454,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
+
+		mutex_lock(&root->fs_info->trans_mutex);
 		put_transaction(cur_trans);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
@@ -463,26 +473,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			mutex_unlock(&root->fs_info->trans_mutex);
 
 			wait_for_commit(root, prev_trans);
-			put_transaction(prev_trans);
 
 			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->trans_mutex);
+			put_transaction(prev_trans);
 		}
 	}
-	while (trans->transaction->num_writers > 1) {
+
+	do {
+		joined = cur_trans->num_joined;
 		WARN_ON(cur_trans != trans->transaction);
-		prepare_to_wait(&trans->transaction->writer_wait, &wait,
+		prepare_to_wait(&cur_trans->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (trans->transaction->num_writers <= 1)
-			break;
+
+		if (cur_trans->num_writers > 1)
+			timeout = MAX_SCHEDULE_TIMEOUT;
+		else
+			timeout = 1;
+
 		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		schedule();
+
+		schedule_timeout(timeout);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
-		finish_wait(&trans->transaction->writer_wait, &wait);
-	}
-	finish_wait(&trans->transaction->writer_wait, &wait);
+		finish_wait(&cur_trans->writer_wait, &wait);
+	} while (cur_trans->num_writers > 1 ||
+		 (cur_trans->num_joined != joined));
+
 	WARN_ON(cur_trans != trans->transaction);
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
@@ -511,6 +530,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_finish_extent_commit(trans, root, &pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
+	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d5f491d3757e..e451783a1a4d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -23,6 +23,7 @@
 struct btrfs_transaction {
 	u64 transid;
 	unsigned long num_writers;
+	unsigned long num_joined;
 	int in_commit;
 	int use_count;
 	int commit_done;
@@ -57,6 +58,12 @@ static inline void btrfs_update_inode_block_group(struct
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
 
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+					      struct inode *inode)
+{
+	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-- 
cgit v1.2.3


From a52d9a8033c454cd9b4697cfafb467509fc1693f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Aug 2007 16:49:44 -0400
Subject: Btrfs: Extent based page cache code.  This uses an rbtree of extents
 and tests instead of buffer heads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    3 +
 fs/btrfs/ctree.h       |    7 +
 fs/btrfs/disk-io.h     |    1 -
 fs/btrfs/extent_map.c  | 1656 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_map.h  |   89 +++
 fs/btrfs/file.c        |  295 ++++-----
 fs/btrfs/inode.c       |  816 +++++++++---------------
 fs/btrfs/super.c       |    2 +
 9 files changed, 2203 insertions(+), 669 deletions(-)
 create mode 100644 fs/btrfs/extent_map.c
 create mode 100644 fs/btrfs/extent_map.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9321438c5eef..ea7a22870350 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,8 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o tree-defrag.o
+	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
+	   extent_map.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f446f161f736..d1d5af471c3c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,11 +19,14 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+#include "extent_map.h"
+
 /* in memory btrfs inode */
 struct btrfs_inode {
 	struct btrfs_root *root;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
+	struct extent_map_tree extent_tree;
 	struct inode vfs_inode;
 
 	/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c7f5161271d3..947f061ed118 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1176,7 +1176,14 @@ int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 int btrfs_get_block(struct inode *inode, sector_t iblock,
 		    struct buffer_head *result, int create);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode);
 /* file.c */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 81fd18cbd824..5261733b8735 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -77,7 +77,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   u64 num_blocks);
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical);
-int btrfs_releasepage(struct page *page, gfp_t flags);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct buffer_head *bh);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..d378edf0964e
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,1656 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include "extent_map.h"
+
+static struct kmem_cache *extent_map_cache;
+static struct kmem_cache *extent_state_cache;
+
+struct tree_entry {
+	u64 start;
+	u64 end;
+	int in_tree;
+	struct rb_node rb_node;
+};
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+static LIST_HEAD(all_states);
+spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
+
+void __init extent_map_init(void)
+{
+	extent_map_cache = kmem_cache_create("extent_map",
+					    sizeof(struct extent_map), 0,
+					    SLAB_RECLAIM_ACCOUNT |
+					    SLAB_DESTROY_BY_RCU,
+					    NULL);
+	extent_state_cache = kmem_cache_create("extent_state",
+					    sizeof(struct extent_state), 0,
+					    SLAB_RECLAIM_ACCOUNT |
+					    SLAB_DESTROY_BY_RCU,
+					    NULL);
+}
+
+void __exit extent_map_exit(void)
+{
+	while(!list_empty(&all_states)) {
+		struct extent_state *state;
+		struct list_head *cur = all_states.next;
+		state = list_entry(cur, struct extent_state, list);
+		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
+		       state->start, state->end, state->state, state->in_tree);
+		list_del(&state->list);
+		kfree(state);
+	}
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+}
+
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{
+	tree->map.rb_node = NULL;
+	tree->state.rb_node = NULL;
+	rwlock_init(&tree->lock);
+	tree->mapping = mapping;
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em;
+	em = kmem_cache_alloc(extent_map_cache, mask);
+	if (!em || IS_ERR(em))
+		return em;
+	em->in_tree = 0;
+	atomic_set(&em->refs, 1);
+	return em;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+
+void free_extent_map(struct extent_map *em)
+{
+	if (atomic_dec_and_test(&em->refs)) {
+		WARN_ON(em->in_tree);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+EXPORT_SYMBOL(free_extent_map);
+
+
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state || IS_ERR(state))
+		return state;
+	state->state = 0;
+	state->in_tree = 0;
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	spin_lock_irq(&state_lock);
+	list_add(&state->list, &all_states);
+	spin_unlock_irq(&state_lock);
+	return state;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+void free_extent_state(struct extent_state *state)
+{
+	if (atomic_dec_and_test(&state->refs)) {
+		WARN_ON(state->in_tree);
+		spin_lock_irq(&state_lock);
+		list_del_init(&state->list);
+		spin_unlock_irq(&state_lock);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+EXPORT_SYMBOL(free_extent_state);
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset < entry->start)
+			p = &(*p)->rb_left;
+		else if (offset > entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				   struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset > entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+	while(prev && offset > prev_entry->end) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+static int tree_delete(struct rb_root *root, u64 offset)
+{
+	struct rb_node *node;
+	struct tree_entry *entry;
+
+	node = __tree_search(root, offset, NULL);
+	if (!node)
+		return -ENOENT;
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 0;
+	rb_erase(node, root);
+	return 0;
+}
+
+/*
+ * add_extent_mapping tries a simple backward merge with existing
+ * mappings.  The extent_map struct passed in will be inserted into
+ * the tree directly (no copies made, just a reference taken).
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em)
+{
+	int ret = 0;
+	struct extent_map *prev = NULL;
+	struct rb_node *rb;
+
+	write_lock_irq(&tree->lock);
+	rb = tree_insert(&tree->map, em->end, &em->rb_node);
+	if (rb) {
+		prev = rb_entry(rb, struct extent_map, rb_node);
+		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
+		ret = -EEXIST;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb)
+			prev = rb_entry(rb, struct extent_map, rb_node);
+		if (prev && prev->end + 1 == em->start &&
+		    ((em->block_start == 0 && prev->block_start == 0) ||
+			     (em->block_start == prev->block_end + 1))) {
+			em->start = prev->start;
+			em->block_start = prev->block_start;
+			rb_erase(&prev->rb_node, &tree->map);
+			prev->in_tree = 0;
+			free_extent_map(prev);
+		}
+	 }
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+
+/*
+ * lookup_extent_mapping returns the first extent_map struct in the
+ * tree that intersects the [start, end] (inclusive) range.  There may
+ * be additional objects in the tree that intersect, so check the object
+ * returned carefully to make sure you don't need additional lookups.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+
+	read_lock_irq(&tree->lock);
+	rb_node = tree_search(&tree->map, start);
+	if (!rb_node) {
+		em = NULL;
+		goto out;
+	}
+	if (IS_ERR(rb_node)) {
+		em = ERR_PTR(PTR_ERR(rb_node));
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	if (em->end < start || em->start > end) {
+		em = NULL;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+
+/*
+ * removes an extent_map struct from the tree.  No reference counts are
+ * dropped, and no checks are done to  see if the range is in use
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int ret;
+
+	write_lock_irq(&tree->lock);
+	ret = tree_delete(&tree->map, em->end);
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree.  Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_map_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *other;
+	struct rb_node *other_node;
+
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	other_node = rb_prev(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->end == state->start - 1 &&
+		    other->state == state->state) {
+			state->start = other->start;
+			other->in_tree = 0;
+			rb_erase(&other->rb_node, &tree->state);
+			free_extent_state(other);
+		}
+	}
+	other_node = rb_next(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->start == state->end + 1 &&
+		    other->state == state->state) {
+			other->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_map_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct rb_node *node;
+
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+	state->state |= bits;
+	state->start = start;
+	state->end = end;
+	if ((end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", start, end);
+		WARN_ON(1);
+	}
+	node = tree_insert(&tree->state, end, &state->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		free_extent_state(state);
+		return -EEXIST;
+	}
+	merge_state(tree, state);
+	return 0;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end].  After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *node;
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	prealloc->state = orig->state;
+	orig->start = split;
+	if ((prealloc->end & 4095) == 0) {
+		printk("insert state %Lu %Lu strange end\n", prealloc->start,
+		       prealloc->end);
+		WARN_ON(1);
+	}
+	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+		free_extent_state(prealloc);
+		return -EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_map_tree *tree,
+			    struct extent_state *state, int bits, int wake,
+			    int delete)
+{
+	int ret = state->state & bits;
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+	if (delete || state->state == 0) {
+		if (state->in_tree) {
+			rb_erase(&state->rb_node, &tree->state);
+			state->in_tree = 0;
+			free_extent_state(state);
+		} else {
+			WARN_ON(1);
+		}
+	} else {
+		merge_state(tree, state);
+	}
+	return ret;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ */
+int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err;
+	int set = 0;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+
+	if (state->start < start) {
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			start = state->end + 1;
+			set |= clear_state_bit(tree, state, bits,
+					wake, delete);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		if (wake)
+			wake_up(&state->wq);
+		set |= clear_state_bit(tree, prealloc, bits,
+				       wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	start = state->end + 1;
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return set;
+
+search_again:
+	if (start >= end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+static int wait_on_state(struct extent_map_tree *tree,
+			 struct extent_state *state)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+	read_unlock_irq(&tree->lock);
+	schedule();
+	read_lock_irq(&tree->lock);
+	finish_wait(&state->wq, &wait);
+	return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+again:
+	while (1) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (state->start > end)
+			goto out;
+
+		if (state->state & bits) {
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			goto again;
+		}
+		start = state->end + 1;
+
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+out:
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	int set;
+	u64 last_start;
+	u64 last_end;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set = state->state & bits;
+		if (set && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		state->state |= bits;
+		start = state->end + 1;
+		merge_state(tree, state);
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			state->state |= bits;
+			start = state->end + 1;
+			merge_state(tree, state);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		prealloc->state |= bits;
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start -1;
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+			      0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * locks a range in ascending order, waiting for any locked regions
+ * it hits on the way.  [start,end] are inclusive, and this will sleep.
+ */
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+	while (1) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		if (err == -EEXIST && (mask & __GFP_WAIT)) {
+			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+			start = failed_start;
+		} else {
+			break;
+		}
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * helper function to set pages and extents in the tree dirty
+ */
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+/*
+ * helper function to lock both pages and extents in the tree.
+ * pages must be locked first.
+ */
+int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int err;
+
+	while (index <= end_index) {
+		page = grab_cache_page(tree->mapping, index);
+		if (!page) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed;
+		}
+		index++;
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+failed:
+	/*
+	 * we failed above in getting the page at 'index', so we undo here
+	 * up to but not including the page at 'index'
+	 */
+	end_index = index;
+	index = start >> PAGE_CACHE_SHIFT;
+	while (index < end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * helper function to unlock both pages and extents in the tree.
+ */
+int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if ever extent in the tree
+ * has the bits set.  Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+			  int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+
+	read_lock_irq(&tree->lock);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > end)
+			break;
+
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+	}
+	read_unlock_irq(&tree->lock);
+	return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static int check_page_uptodate(struct extent_map_tree *tree,
+			       struct page *page)
+{
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static int check_page_locked(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ */
+static int check_page_writeback(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static int end_bio_extent_writepage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ */
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      bio_end_io_t end_io_func)
+{
+	struct bio *bio;
+	int ret = 0;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_io_vec[0].bv_page = page;
+	bio->bi_io_vec[0].bv_len = size;
+	bio->bi_io_vec[0].bv_offset = offset;
+
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = size;
+
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+
+	bio_get(bio);
+	submit_bio(rw, bio);
+
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+
+	bio_put(bio);
+	return ret;
+}
+
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ */
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent)
+{
+	struct inode *inode = page->mapping->host;
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+
+		iosize = min(em->end - cur, end - cur) + 1;
+		cur_end = min(em->end, end);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == 0) {
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_readpage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (page->index == end_index) {
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+		zero_user_page(page, offset,
+			       PAGE_CACHE_SIZE - offset, KM_USER0);
+	}
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+		iosize = min(em->end - cur, end - cur) + 1;
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+				   EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		set_range_writeback(tree, cur, cur + iosize - 1);
+		ret = submit_extent_page(WRITE, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_writepage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	u64 start = (page->index << PAGE_CACHE_SHIFT);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+	start += (offset + blocksize -1) & ~(blocksize - 1);
+	if (start > end)
+		return 0;
+
+	lock_extent(tree, start, end, GFP_NOFS);
+	wait_on_extent_writeback(tree, start, end);
+	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
+			 1, 1, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(extent_invalidatepage);
+
+/*
+ * simple commit_write call, set_range_dirty is used to mark both
+ * the pages and the extent records as dirty
+ */
+int extent_commit_write(struct extent_map_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to)
+{
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	set_page_dirty(page);
+
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_commit_write);
+
+int extent_prepare_write(struct extent_map_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent)
+{
+	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 block_start;
+	u64 orig_block_start;
+	u64 block_end;
+	u64 cur_end;
+	struct extent_map *em;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	size_t page_offset = 0;
+	size_t block_off_start;
+	size_t block_off_end;
+	int err = 0;
+	int iocount = 0;
+	int ret = 0;
+	int isnew;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+	block_start = (page_start + from) & ~((u64)blocksize - 1);
+	block_end = (page_start + to - 1) | (blocksize - 1);
+	orig_block_start = block_start;
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	while(block_start <= block_end) {
+		em = get_extent(inode, page, page_offset, block_start,
+				block_end, 1);
+		if (IS_ERR(em) || !em) {
+			goto err;
+		}
+		cur_end = min(block_end, em->end);
+		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+		block_off_end = block_off_start + blocksize;
+		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+		if (!PageUptodate(page) && isnew &&
+		    (block_off_end > to || block_off_start < from)) {
+			void *kaddr;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_off_end > to)
+				memset(kaddr + to, 0, block_off_end - to);
+			if (block_off_start < from)
+				memset(kaddr + block_off_start, 0,
+				       from - block_off_start);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		if (!isnew && !PageUptodate(page) &&
+		    (block_off_end > to || block_off_start < from) &&
+		    !test_range_bit(tree, block_start, cur_end,
+				    EXTENT_UPTODATE, 1)) {
+			u64 sector;
+			u64 extent_offset = block_start - em->start;
+			size_t iosize;
+			sector = (em->block_start + extent_offset) >> 9;
+			iosize = (cur_end - block_start + blocksize - 1) &
+				~((u64)blocksize - 1);
+			/*
+			 * we've already got the extent locked, but we
+			 * need to split the state such that our end_bio
+			 * handler can clear the lock.
+			 */
+			set_extent_bit(tree, block_start,
+				       block_start + iosize - 1,
+				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, em->bdev,
+					 end_bio_extent_preparewrite);
+			iocount++;
+			block_start = block_start + iosize;
+		} else {
+			set_extent_uptodate(tree, block_start, cur_end,
+					    GFP_NOFS);
+			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			block_start = cur_end + 1;
+		}
+		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+		free_extent_map(em);
+	}
+	if (iocount) {
+		wait_extent_bit(tree, orig_block_start,
+				block_end, EXTENT_LOCKED);
+	}
+	check_page_uptodate(tree, page);
+err:
+	/* FIXME, zero out newly allocated blocks on error */
+	return err;
+}
+EXPORT_SYMBOL(extent_prepare_write);
+
+/*
+ * a helper for releasepage.  As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
+{
+	struct extent_map *em;
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 orig_start = start;
+
+	while (start <= end) {
+		em = lookup_extent_mapping(tree, start, end);
+		if (!em || IS_ERR(em))
+			break;
+		if (test_range_bit(tree, em->start, em->end,
+				   EXTENT_LOCKED, 0)) {
+			free_extent_map(em);
+			start = em->end + 1;
+printk("range still locked %Lu %Lu\n", em->start, em->end);
+			break;
+		}
+		remove_extent_mapping(tree, em);
+		start = em->end + 1;
+		/* once for the rb tree */
+		free_extent_map(em);
+		/* once for us */
+		free_extent_map(em);
+	}
+	WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0));
+	clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
+			 1, 1, GFP_NOFS);
+	return 1;
+}
+EXPORT_SYMBOL(try_release_extent_mapping);
+
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..108944aab4bd
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,89 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_INLINE (u64)-2
+#define EXTENT_MAP_DELALLOC (u64)-1
+
+struct extent_map_tree {
+	struct rb_root map;
+	struct rb_root state;
+	struct address_space *mapping;
+	rwlock_t lock;
+};
+
+/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
+struct extent_map {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	/* block_start and block_end are in bytes */
+	u64 block_start;
+	u64 block_end; /* inclusive */
+	struct block_device *bdev;
+	atomic_t refs;
+};
+
+/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
+struct extent_state {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	wait_queue_head_t wq;
+	atomic_t refs;
+	unsigned long state;
+	struct list_head list;
+};
+
+struct extent_buffer {
+	u64 start;
+	u64 end; /* inclusive */
+	char *addr;
+	struct page *pages[];
+};
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+					  struct page *page,
+					  size_t page_offset,
+					  u64 start, u64 end,
+					  int create);
+
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end);
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page);
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+struct extent_map *alloc_extent_map(gfp_t mask);
+void free_extent_map(struct extent_map *em);
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent);
+void __init extent_map_init(void);
+void __exit extent_map_exit(void);
+int extent_clean_all_trees(struct extent_map_tree *tree);
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask);
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		   gfp_t mask);
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc);
+int extent_prepare_write(struct extent_map_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_map_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to);
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
+#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6933ab11a5cd..71a481894ab6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -81,14 +81,14 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
 				u64 offset, ssize_t size,
-				struct buffer_head *bh)
+				struct page *page, size_t page_offset)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	char *ptr, *kaddr;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_file_extent_item *ei;
 	u32 datasize;
 	int err = 0;
@@ -98,8 +98,6 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
@@ -122,18 +120,13 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
 				   BTRFS_FILE_EXTENT_INLINE);
 	ptr = btrfs_file_extent_inline_start(ei);
 
-	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	kaddr = kmap_atomic(page, KM_USER0);
 	btrfs_memcpy(root, path->nodes[0]->b_data,
-		     ptr, kaddr + bh_offset(bh),
-		     size);
+		     ptr, kaddr + page_offset, size);
 	kunmap_atomic(kaddr, KM_USER0);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_free_path(path);
-	ret = btrfs_end_transaction(trans, root);
-	if (ret && !err)
-		err = ret;
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
 
@@ -145,45 +138,143 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   loff_t pos,
 				   size_t write_bytes)
 {
-	int i;
-	int offset;
 	int err = 0;
-	int ret;
-	int this_write;
+	int i;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct buffer_head *bh;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 hint_block;
+	u64 num_blocks;
+	u64 start_pos;
+	u64 end_of_last_block;
+	u64 end_pos = pos + write_bytes;
+	loff_t isize = i_size_read(inode);
 
-	for (i = 0; i < num_pages; i++) {
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
 
-		/* FIXME, one block at a time */
-		bh = page_buffers(pages[i]);
+	em->bdev = inode->i_sb->s_bdev;
 
-		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-			ret = insert_inline_extent(root, inode,
-					pages[i]->index << PAGE_CACHE_SHIFT,
-					offset + this_write, bh);
-			if (ret) {
-				err = ret;
-				goto failed;
-			}
-		}
+	start_pos = pos & ~((u64)root->blocksize - 1);
+	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
 
-		ret = btrfs_commit_write(file, pages[i], offset,
-					 offset + this_write);
-		pos += this_write;
-		if (ret) {
-			err = ret;
+	end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, inode);
+	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
+
+	if ((end_of_last_block & 4095) == 0) {
+		printk("strange end of last %Lu %lu %Lu\n", start_pos, write_bytes, end_of_last_block);
+	}
+	set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+
+	/* FIXME...EIEIO, ENOSPC and more */
+
+	/* step one, delete the existing extents in this range */
+	/* FIXME blocksize != pagesize */
+	if (start_pos < inode->i_size) {
+		err = btrfs_drop_extents(trans, root, inode,
+			 start_pos, (pos + write_bytes + root->blocksize -1) &
+			 ~((u64)root->blocksize - 1), &hint_block);
+		if (err)
+			goto failed;
+	}
+
+	/* insert any holes we need to create */
+	if (inode->i_size < start_pos) {
+		u64 last_pos_in_file;
+		u64 hole_size;
+		u64 mask = root->blocksize - 1;
+		last_pos_in_file = (isize + mask) & ~mask;
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+		if (last_pos_in_file < start_pos) {
+			err = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       last_pos_in_file,
+						       0, 0, hole_size);
+		}
+		if (err)
 			goto failed;
+	}
+
+	/*
+	 * either allocate an extent for the new bytes or setup the key
+	 * to show we are doing inline data in the extent
+	 */
+	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
+	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+		err = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 num_blocks, 0, hint_block, (u64)-1,
+					 &ins, 1);
+		BUG_ON(err);
+		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       start_pos, ins.objectid, ins.offset,
+				       ins.offset);
+		BUG_ON(err);
+		em->start = start_pos;
+		em->end = end_of_last_block;
+		em->block_start = ins.objectid << inode->i_blkbits;
+		em->block_end = em->block_start +
+			(ins.offset << inode->i_blkbits) - 1;
+		set_extent_dirty(em_tree, start_pos, end_of_last_block,
+				 GFP_NOFS);
+		err = add_extent_mapping(em_tree, em);
+		for (i = 0; i < num_pages; i++) {
+			struct page *p = pages[i];
+			SetPageUptodate(p);
+			__set_page_dirty_nobuffers(p);
 		}
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
+	} else {
+		struct page *p = pages[0];
+		err = insert_inline_extent(trans, root, inode, start_pos,
+					   end_pos - start_pos, p, 0);
+		BUG_ON(err);
+		em->start = start_pos;
+		em->end = end_pos;
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+		add_extent_mapping(em_tree, em);
+	}
+	if (end_pos > isize) {
+		i_size_write(inode, end_pos);
+		btrfs_update_inode(trans, root, inode);
 	}
 failed:
+	err = btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	free_extent_map(em);
 	return err;
 }
 
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+
+	while(1) {
+		em = lookup_extent_mapping(em_tree, start, end);
+		if (!em)
+			break;
+		remove_extent_mapping(em_tree, em);
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree*/
+		free_extent_map(em);
+	}
+	return 0;
+}
+
 /*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
@@ -213,6 +304,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int found_inline;
 	int recow;
 
+	btrfs_drop_extent_cache(inode, start, end - 1);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -434,18 +527,9 @@ static int prepare_pages(struct btrfs_root *root,
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	int offset;
 	int err = 0;
-	int this_write;
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	loff_t isize = i_size_read(inode);
-	struct btrfs_trans_handle *trans;
-	u64 hint_block;
 	u64 num_blocks;
-	u64 alloc_extent_start;
 	u64 start_pos;
-	struct btrfs_key ins;
 
 	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
 	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
@@ -457,119 +541,17 @@ static int prepare_pages(struct btrfs_root *root,
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
 			err = -ENOMEM;
-			goto failed_release;
+			BUG_ON(1);
 		}
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
 		wait_on_page_writeback(pages[i]);
-	}
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		mutex_unlock(&root->fs_info->fs_mutex);
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	/* FIXME blocksize != 4096 */
-	inode->i_blocks += num_blocks << 3;
-	hint_block = 0;
-
-	/* FIXME...EIEIO, ENOSPC and more */
-
-	/* step one, delete the existing extents in this range */
-	/* FIXME blocksize != pagesize */
-	if (start_pos < inode->i_size) {
-		err = btrfs_drop_extents(trans, root, inode,
-			 start_pos, (pos + write_bytes + root->blocksize -1) &
-			 ~((u64)root->blocksize - 1), &hint_block);
-		if (err)
-			goto failed_release;
-	}
-
-	/* insert any holes we need to create */
-	if (inode->i_size < start_pos) {
-		u64 last_pos_in_file;
-		u64 hole_size;
-		u64 mask = root->blocksize - 1;
-		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
-		if (last_pos_in_file < start_pos) {
-			err = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       last_pos_in_file,
-						       0, 0, hole_size);
-		}
-		if (err)
-			goto failed_release;
-	}
-
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
-	 */
-	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
-	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		err = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 0, hint_block, (u64)-1,
-					 &ins, 1);
-		if (err)
-			goto failed_truncate;
-		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset,
-				       ins.offset);
-		if (err)
-			goto failed_truncate;
-	} else {
-		ins.offset = 0;
-		ins.objectid = 0;
-	}
-	BUG_ON(err);
-	alloc_extent_start = ins.objectid;
-	err = btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	for (i = 0; i < num_pages; i++) {
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
-		if (!page_has_buffers(pages[i])) {
-			create_empty_buffers(pages[i],
-					     root->fs_info->sb->s_blocksize,
-					     (1 << BH_Uptodate));
+		if (!PagePrivate(pages[i])) {
+			SetPagePrivate(pages[i]);
+			set_page_private(pages[i], 1);
+			page_cache_get(pages[i]);
 		}
-		head = page_buffers(pages[i]);
-		bh = head;
-		do {
-			err = btrfs_map_bh_to_logical(root, bh,
-						      alloc_extent_start);
-			BUG_ON(err);
-			if (err)
-				goto failed_truncate;
-			bh = bh->b_this_page;
-			if (alloc_extent_start)
-				alloc_extent_start++;
-		} while (bh != head);
-		pos += this_write;
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
 	}
 	return 0;
-
-failed_release:
-	btrfs_drop_pages(pages, num_pages);
-	return err;
-
-failed_truncate:
-	btrfs_drop_pages(pages, num_pages);
-	if (pos > isize)
-		vmtruncate(inode, isize);
-	return err;
-
-out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	goto failed_release;
-
 }
 
 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
@@ -685,7 +667,6 @@ out:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 	current->backing_dev_info = NULL;
-	mark_inode_dirty(inode);
 	return num_written ? num_written : err;
 }
 
@@ -714,8 +695,8 @@ static int btrfs_sync_file(struct file *file,
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	/*
- 	 * ok we haven't committed the transaction yet, lets do a commit
- 	 */
+	 * ok we haven't committed the transaction yet, lets do a commit
+	 */
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 504d37dfa6c9..fd4cb65fcf1f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -121,6 +121,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -169,7 +171,7 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
-static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
 {
@@ -400,6 +402,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int found_extent;
 	int del_item;
 
+	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -511,6 +514,50 @@ error:
 	return ret;
 }
 
+static int btrfs_cow_one_page(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct page *page,
+			      size_t zero_start)
+{
+	char *kaddr;
+	int ret = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint = 0;
+	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	struct btrfs_key ins;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = btrfs_drop_extents(trans, root, inode,
+				 page_start, page_start + PAGE_CACHE_SIZE,
+				 &alloc_hint);
+	if (ret)
+		goto out;
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
+				 alloc_hint, (u64)-1, &ins, 1);
+	if (ret)
+		goto out;
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       page_start, ins.objectid, 1, 1);
+	if (ret)
+		goto out;
+	SetPageChecked(page);
+	kaddr = kmap(page);
+	if (zero_start != PAGE_CACHE_SIZE) {
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+	}
+	kunmap(page);
+
+out:
+	return ret;
+}
+
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -518,16 +565,14 @@ error:
 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	unsigned blocksize = 1 << inode->i_blkbits;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
-	char *kaddr;
 	int ret = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 alloc_hint = 0;
-	struct btrfs_key ins;
-	struct btrfs_trans_handle *trans;
+	u64 page_start;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -536,7 +581,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
-
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -545,37 +589,24 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 			goto out;
 		}
 	}
+	page_start = page->index << PAGE_CACHE_SHIFT;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
-	ret = btrfs_drop_extents(trans, root, inode,
-				 page->index << PAGE_CACHE_SHIFT,
-				 (page->index + 1) << PAGE_CACHE_SHIFT,
-				 &alloc_hint);
-	if (ret)
-		goto out;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
-				 alloc_hint, (u64)-1, &ins, 1);
-	if (ret)
-		goto out;
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       page->index << PAGE_CACHE_SHIFT,
-				       ins.objectid, 1, 1);
-	if (ret)
-		goto out;
-	SetPageChecked(page);
-	kaddr = kmap(page);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	ret = btrfs_csum_file_block(trans, root, inode->i_ino,
-			      page->index << PAGE_CACHE_SHIFT,
-			      kaddr, PAGE_CACHE_SIZE);
-	kunmap(page);
+	ret = btrfs_cow_one_page(trans, inode, page, offset);
+	if (!ret) {
+		char *kaddr = kmap(page);
+		ret = btrfs_csum_file_block(trans, root, inode->i_ino,
+					    page_start, kaddr, PAGE_CACHE_SIZE);
+		kunmap(page);
+	}
+	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+			 page_start, page_start + PAGE_CACHE_SIZE - 1,
+			 GFP_NOFS);
+	set_page_dirty(page);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
-	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -1095,6 +1126,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -1238,6 +1271,182 @@ out_unlock:
 	return err;
 }
 
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	u32 found_type;
+	int failed_insert = 0;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+again:
+	em = lookup_extent_mapping(em_tree, start, end);
+	if (em) {
+		goto out;
+	}
+	if (!em) {
+		em = alloc_extent_map(GFP_NOFS);
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		em->start = 0;
+		em->end = 0;
+	}
+	em->bdev = inode->i_sb->s_bdev;
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       objectid, start, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		goto not_found;
+	}
+
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_end = extent_start +
+		       (btrfs_file_extent_num_blocks(item) << inode->i_blkbits);
+		err = 0;
+		if (start < extent_start || start > extent_end) {
+			em->start = start;
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		if (btrfs_file_extent_disk_blocknr(item) == 0) {
+			em->start = extent_start;
+			em->end = extent_end - 1;
+			em->block_start = 0;
+			em->block_end = 0;
+			goto insert;
+		}
+		em->block_start = blocknr << inode->i_blkbits;
+		em->block_end = em->block_start +
+			(btrfs_file_extent_num_blocks(item) <<
+			 inode->i_blkbits) - 1;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		goto insert;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = extent_start + size;
+		if (start < extent_start || start > extent_end) {
+			em->start = start;
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		if (!page) {
+			goto insert;
+		}
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(page);
+		memcpy(map + page_offset, ptr, size);
+		flush_dcache_page(result->b_page);
+		kunmap(page);
+		set_extent_uptodate(em_tree, extent_start,
+				    extent_end, GFP_NOFS);
+		goto insert;
+	} else {
+		printk("unkknown found_type %d\n", found_type);
+		WARN_ON(1);
+	}
+not_found:
+	em->start = start;
+	em->end = end;
+not_found_em:
+	em->block_start = 0;
+	em->block_end = 0;
+insert:
+	btrfs_release_path(root, path);
+	if (em->start > start || em->end < start) {
+		printk("bad extent! %Lu %Lu start %Lu end %Lu\n", em->start, em->end, start, end);
+		WARN_ON(1);
+		err = -EIO;
+		goto out;
+	}
+	ret = add_extent_mapping(em_tree, em);
+	if (ret == -EEXIST) {
+		free_extent_map(em);
+		failed_insert++;
+		if (failed_insert > 5) {
+			printk("failing to insert %Lu %Lu\n", start, end);
+			err = -EIO;
+			goto out;
+		}
+		em = NULL;
+		goto again;
+	}
+	err = 0;
+out:
+	btrfs_free_path(path);
+	if (trans) {
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (err) {
+		free_extent_map(em);
+		WARN_ON(1);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
+
 /*
  * FIBMAP and others want to pass in a fake buffer head.  They need to
  * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
@@ -1398,46 +1607,22 @@ int btrfs_get_block(struct inode *inode, sector_t iblock,
 	return err;
 }
 
-static int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
-				struct buffer_head *result, int create)
-{
-	int ret;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *page = result->b_page;
-	u64 offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(result);
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_get_block_lock(inode, iblock, result, create);
-	if (ret)
-		goto out;
-
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		result->b_private = NULL;
-		goto out;
-	}
-	memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
-out:
-	if (path)
-		btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
 static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	u64 start = iblock << inode->i_blkbits;
+	u64 end = start + root->blocksize -1;
+	struct extent_map *em;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, end, 0);
+	if (em && !IS_ERR(em) && em->block_start != EXTENT_MAP_INLINE &&
+	    em->block_start != 0) {
+		u64 offset;
+		offset = start - em->start;
+		start = (em->block_start + offset) >> inode->i_blkbits;
+		btrfs_map_bh_to_logical(root, result, start);
+	}
 	return 0;
 }
 
@@ -1449,442 +1634,50 @@ static sector_t btrfs_bmap(struct address_space *as, sector_t block)
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	return block_prepare_write(page, from, to, btrfs_get_block);
+	return extent_prepare_write(&BTRFS_I(page->mapping->host)->extent_tree,
+				    page->mapping->host, page, from, to,
+				    btrfs_get_extent);
 }
 
-static void buffer_io_error(struct buffer_head *bh)
+int btrfs_readpage(struct file *file, struct page *page)
 {
-	char b[BDEVNAME_SIZE];
-
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
-			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_read_full_page(tree, page, btrfs_get_extent);
 }
-
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
-static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	unsigned long flags;
-	struct buffer_head *first;
-	struct buffer_head *tmp;
-	struct page *page;
-	int page_uptodate = 1;
-	struct inode *inode;
-	int ret;
-
-	BUG_ON(!buffer_async_read(bh));
-
-	page = bh->b_page;
-	inode = page->mapping->host;
-	if (uptodate) {
-		void *kaddr;
-		struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-		if (bh->b_private) {
-			char csum[BTRFS_CRC32_SIZE];
-			kaddr = kmap_atomic(page, KM_IRQ0);
-			ret = btrfs_csum_data(root, kaddr + bh_offset(bh),
-					      bh->b_size, csum);
-			BUG_ON(ret);
-			if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) {
-				u64 offset;
-				offset = (page->index << PAGE_CACHE_SHIFT) +
-					bh_offset(bh);
-				printk("btrfs csum failed ino %lu off %llu\n",
-				       page->mapping->host->i_ino,
-				       (unsigned long long)offset);
-				memset(kaddr + bh_offset(bh), 1, bh->b_size);
-				flush_dcache_page(page);
-			}
-			kunmap_atomic(kaddr, KM_IRQ0);
-		}
-		set_buffer_uptodate(bh);
-	} else {
-		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
-			buffer_io_error(bh);
-		SetPageError(page);
-	}
-
-	/*
-	 * Be _very_ careful from here on. Bad things can happen if
-	 * two buffer heads end IO at almost the same time and both
-	 * decide that the page is now completely done.
-	 */
-	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
-	clear_buffer_async_read(bh);
-	unlock_buffer(bh);
-	tmp = bh;
-	do {
-		if (!buffer_uptodate(tmp))
-			page_uptodate = 0;
-		if (buffer_async_read(tmp)) {
-			BUG_ON(!buffer_locked(tmp));
-			goto still_busy;
-		}
-		tmp = tmp->b_this_page;
-	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
-
-	/*
-	 * If none of the buffers had errors and they are all
-	 * uptodate then we can set the page uptodate.
-	 */
-	if (page_uptodate && !PageError(page))
-		SetPageUptodate(page);
-	unlock_page(page);
-	return;
-
-still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
-	return;
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-/*
- * Generic "read page" function for block devices that have the normal
- * get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
- * set/clear_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
- */
-int btrfs_readpage(struct file *file, struct page *page)
+static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct inode *inode = page->mapping->host;
-	sector_t iblock, lblock;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
-	int nr, i;
-	int fully_mapped = 1;
-
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
-
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
-	bh = head;
-	nr = 0;
-	i = 0;
-
-	do {
-		if (buffer_uptodate(bh))
-			continue;
-
-		if (!buffer_mapped(bh)) {
-			int err = 0;
-
-			fully_mapped = 0;
-			if (iblock < lblock) {
-				WARN_ON(bh->b_size != blocksize);
-				err = btrfs_get_block_csum(inode, iblock,
-							   bh, 0);
-				if (err)
-					SetPageError(page);
-			}
-			if (!buffer_mapped(bh)) {
-				void *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + i * blocksize, 0, blocksize);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
-				if (!err)
-					set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * get_block() might have updated the buffer
-			 * synchronously
-			 */
-			if (buffer_uptodate(bh))
-				continue;
-		}
-		arr[nr++] = bh;
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-	if (fully_mapped)
-		SetPageMappedToDisk(page);
-
-	if (!nr) {
-		/*
-		 * All buffers are uptodate - we can set the page uptodate
-		 * as well. But not if get_block() returned an error.
-		 */
-		if (!PageError(page))
-			SetPageUptodate(page);
-		unlock_page(page);
-		return 0;
-	}
-
-	/* Stage two: lock the buffers */
-	for (i = 0; i < nr; i++) {
-		bh = arr[i];
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_async_read;
-		set_buffer_async_read(bh);
-	}
-
-	/*
-	 * Stage 3: start the IO.  Check for uptodateness
-	 * inside the buffer lock in case another process reading
-	 * the underlying blockdev brought it uptodate (the sct fix).
-	 */
-	for (i = 0; i < nr; i++) {
-		bh = arr[i];
-		if (buffer_uptodate(bh))
-			btrfs_end_buffer_async_read(bh, 1);
-		else
-			submit_bh(READ, bh);
-	}
-	return 0;
-}
-
-/*
- * Aside from a tiny bit of packed file data handling, this is the
- * same as the generic code.
- *
- * While block_write_full_page is writing back the dirty buffers under
- * the page lock, whoever dirtied the buffers may decide to clean them
- * again at any time.  We handle that by only looking at the buffer
- * state inside lock_buffer().
- *
- * If block_write_full_page() is called for regular writeback
- * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
- * locked buffer.   This only can happen if someone has written the buffer
- * directly, with submit_bh().  At the address_space level PageWriteback
- * prevents this contention from occurring.
- */
-static int __btrfs_write_full_page(struct inode *inode, struct page *page,
-				   struct writeback_control *wbc)
-{
-	int err;
-	sector_t block;
-	sector_t last_block;
-	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
-	int nr_underway = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	/* no csumming allowed when from PF_MEMALLOC */
-	if (current->flags & PF_MEMALLOC) {
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
+	struct extent_map_tree *tree;
+	int ret;
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	if (page->private != 1) {
+		WARN_ON(1);
+		return try_to_free_buffers(page);
 	}
-
-	/*
-	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
-	 * here, and the (potentially unmapped) buffers may become dirty at
-	 * any time.  If a buffer becomes dirty here after we've inspected it
-	 * then we just miss that fact, and the page stays dirty.
-	 *
-	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
-	 * handle that here by just cleaning them.
-	 */
-
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
-	bh = head;
-
-	/*
-	 * Get all the dirty buffers mapped to disk addresses and
-	 * handle any aliases from the underlying blockdev's mapping.
-	 */
-	do {
-		if (block > last_block) {
-			/*
-			 * mapped buffers outside i_size will occur, because
-			 * this page can be outside i_size when there is a
-			 * truncate in progress.
-			 */
-			/*
-			 * The buffer was zeroed by block_write_full_page()
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
-			WARN_ON(bh->b_size != blocksize);
-			err = btrfs_get_block(inode, block, bh, 0);
-			if (err) {
-				goto recover;
-			}
-			if (buffer_new(bh)) {
-				/* blockdev mappings never come here */
-				clear_buffer_new(bh);
-			}
-		}
-		bh = bh->b_this_page;
-		block++;
-	} while (bh != head);
-
-	do {
-		if (!buffer_mapped(bh))
-			continue;
-		/*
-		 * If it's a fully non-blocking write attempt and we cannot
-		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
-		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
-			lock_buffer(bh);
-		} else if (test_set_buffer_locked(bh)) {
-			redirty_page_for_writepage(wbc, page);
-			continue;
-		}
-		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
-			struct btrfs_trans_handle *trans;
-			int ret;
-			u64 off = page->index << PAGE_CACHE_SHIFT;
-			char *kaddr;
-
-			off += bh_offset(bh);
-			mutex_lock(&root->fs_info->fs_mutex);
-			trans = btrfs_start_transaction(root, 1);
-			btrfs_set_trans_block_group(trans, inode);
-			kaddr = kmap(page);
-			btrfs_csum_file_block(trans, root, inode->i_ino,
-						    off, kaddr + bh_offset(bh),
-						    bh->b_size);
-			kunmap(page);
-			ret = btrfs_end_transaction(trans, root);
-			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->fs_mutex);
-			mark_buffer_async_write(bh);
-		} else {
-			unlock_buffer(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	/*
-	 * The page and its buffers are protected by PageWriteback(), so we can
-	 * drop the bh refcounts early.
-	 */
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-
-	err = 0;
-done:
-	if (nr_underway == 0) {
-		/*
-		 * The page was marked dirty, but the buffers were
-		 * clean.  Someone wrote them back by hand with
-		 * ll_rw_block/submit_bh.  A rare case.
-		 */
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (uptodate)
-			SetPageUptodate(page);
-		end_page_writeback(page);
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(tree, page);
+	if (ret == 1) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
 	}
-	return err;
-
-recover:
-	/*
-	 * ENOSPC, or some other error.  We may already have added some
-	 * blocks to the file, so we need to write these out to avoid
-	 * exposing stale data.
-	 * The page is currently locked and not marked for writeback
-	 */
-	bh = head;
-	/* Recovery: lock and submit the mapped buffers */
-	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
-			lock_buffer(bh);
-			mark_buffer_async_write(bh);
-		} else {
-			/*
-			 * The buffer may have been set dirty during
-			 * attachment to a dirty page.
-			 */
-			clear_buffer_dirty(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-	SetPageError(page);
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-	goto done;
+	return ret;
 }
 
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct inode * const inode = page->mapping->host;
-	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset;
-	void *kaddr;
-
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		return __btrfs_write_full_page(inode, page, wbc);
-
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
-		/*
-		 * The page may have dirty, unmapped buffers.  For example,
-		 * they may have been added in ext3_writepage().  Make them
-		 * freeable here, so the page does not leak.
-		 */
-		block_invalidatepage(page, 0);
-		unlock_page(page);
-		return 0; /* don't care */
-	}
+	struct extent_map_tree *tree;
 
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invokation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	return __btrfs_write_full_page(inode, page, wbc);
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	extent_invalidatepage(tree, page, offset);
+	btrfs_releasepage(page, GFP_NOFS);
 }
 
 /*
@@ -1905,28 +1698,39 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	unsigned long end;
 	loff_t size;
 	int ret = -EINVAL;
+	u64 page_start;
 
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
+	page_start = page->index << PAGE_CACHE_SHIFT;
+
 	if ((page->mapping != inode->i_mapping) ||
-	    ((page->index << PAGE_CACHE_SHIFT) > size)) {
+	    (page_start > size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+	if (page_start + PAGE_CACHE_SIZE > size)
 		end = size & ~PAGE_CACHE_MASK;
 	else
 		end = PAGE_CACHE_SIZE;
 
-	ret = btrfs_prepare_write(NULL, page, 0, end);
-	if (!ret)
-		ret = btrfs_commit_write(NULL, page, 0, end);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_cow_one_page(trans, inode, page, end);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+			 page_start, page_start + PAGE_CACHE_SIZE - 1,
+			 GFP_NOFS);
+	set_page_dirty(page);
 
 out_unlock:
 	unlock_page(page);
@@ -1962,21 +1766,8 @@ static void btrfs_truncate(struct inode *inode)
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *bh;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	SetPageUptodate(page);
-	bh = page_buffers(page);
-	set_buffer_uptodate(bh);
-	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-		set_page_dirty(page);
-	}
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
+	return extent_commit_write(&BTRFS_I(page->mapping->host)->extent_tree,
+				   page->mapping->host, page, from, to);
 }
 
 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
@@ -2471,6 +2262,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2553,6 +2346,9 @@ static struct address_space_operations btrfs_aops = {
 	.prepare_write	= btrfs_prepare_write,
 	.commit_write	= btrfs_commit_write,
 	.bmap		= btrfs_bmap,
+	.invalidatepage = btrfs_invalidatepage,
+	.releasepage	= btrfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74f3de47423c..7b7c32c73a2c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -185,6 +185,7 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_init_cachep();
 	if (err)
 		return err;
+	extent_map_init();
 	return register_filesystem(&btrfs_fs_type);
 }
 
@@ -192,6 +193,7 @@ static void __exit exit_btrfs_fs(void)
 {
 	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
+	extent_map_exit();
 	unregister_filesystem(&btrfs_fs_type);
 }
 
-- 
cgit v1.2.3


From dc17ff8f11d129db9e83ab7244769e4eae05e14d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:30 -0500
Subject: Btrfs: Add data=ordered support

This forces file data extents down the disk along with the metadata that
references them.  The current implementation is fairly simple, and just
writes out all of the dirty pages in an inode before the commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/btrfs_inode.h  |   1 +
 fs/btrfs/ctree.c        |   6 +-
 fs/btrfs/ctree.h        |   7 +-
 fs/btrfs/disk-io.c      |  16 +++-
 fs/btrfs/disk-io.h      |   2 +
 fs/btrfs/extent-tree.c  |   4 +-
 fs/btrfs/file.c         |   5 +-
 fs/btrfs/inode.c        |  46 ++++++----
 fs/btrfs/ordered-data.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |  39 +++++++++
 fs/btrfs/transaction.c  |  58 +++++++++++++
 fs/btrfs/transaction.h  |   8 +-
 13 files changed, 387 insertions(+), 28 deletions(-)
 create mode 100644 fs/btrfs/ordered-data.c
 create mode 100644 fs/btrfs/ordered-data.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5804c5ca075..ab9a9f8edbf8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d1d5af471c3c..f27e633f1742 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ struct btrfs_inode {
 	struct extent_map_tree extent_tree;
 	struct inode vfs_inode;
 
+	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c57074a376..43d23148a4fe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -221,7 +221,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
+	u64 header_trans;
 	int ret;
+
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -232,7 +234,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (btrfs_header_generation(buf) == trans->transid) {
+
+	header_trans = btrfs_header_generation(buf);
+	if (header_trans == trans->transid) {
 		*cow_ret = buf;
 		return 0;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9873975ce0ee..b55dba58dfaa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,8 +16,8 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __BTRFS__
-#define __BTRFS__
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
 
 #include <linux/version.h>
 #include <linux/mm.h>
@@ -363,7 +363,6 @@ struct btrfs_root {
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
-	struct rw_semaphore snap_sem;
 	u64 objectid;
 	u64 last_trans;
 
@@ -1142,6 +1141,8 @@ void btrfs_destroy_cachep(void);
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6170ff19e7f..34cf1f1f47be 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -406,7 +406,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
 	init_completion(&root->kobj_unregister);
-	init_rwsem(&root->snap_sem);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
@@ -498,6 +497,21 @@ insert:
 	return root;
 }
 
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_root *root;
+
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_objectid);
+	return root;
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c3cfd02901f..dae9fba8efcd 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -34,6 +34,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c906bb19b211..68137cd8506a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1195,7 +1195,9 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			if (btrfs_buffer_uptodate(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
-				if (btrfs_header_generation(buf) == transid) {
+				u64 header_transid =
+					btrfs_header_generation(buf);
+				if (header_transid == transid) {
 					free_extent_buffer(buf);
 					return 1;
 				}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 94c93373cb7d..0a5f4defe59b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,6 +34,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 
@@ -329,6 +330,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
 					  start_pos) - existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
+		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -724,8 +726,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
-
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -804,7 +804,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d0cd9a7a615..6d6e1ac0a9a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,6 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+	btrfs_add_ordered_inode(inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -367,8 +368,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
-
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret)
 		goto make_bad;
@@ -898,7 +899,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
 
-	down_read(&root->snap_sem);
 	ret = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -917,7 +917,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	unlock_page(page);
 	page_cache_release(page);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 out:
 	return ret;
 }
@@ -1146,6 +1145,19 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid)
+{
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid);
+
+	if (!args.root)
+		return NULL;
+
+	return ilookup5(s, objectid, btrfs_find_actor, (void *)&args);
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
@@ -1336,7 +1348,6 @@ read_dir_items:
 
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
 			over = filldir(dirent, name_ptr, name_len,
 				       found_key.offset,
 				       location.objectid,
@@ -2054,7 +2065,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 
 	ret = -EINVAL;
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
@@ -2075,7 +2085,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 	unlock_page(page);
 out:
 	return ret;
@@ -2118,7 +2127,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
-	struct btrfs_root *new_root;
+	struct btrfs_root *new_root = root;
 	struct inode *inode;
 	struct inode *dir;
 	int ret;
@@ -2230,7 +2239,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 		goto fail;
 fail:
 	nr = trans->blocks_used;
-	err = btrfs_commit_transaction(trans, root);
+	err = btrfs_commit_transaction(trans, new_root);
 	if (err && !ret)
 		ret = err;
 fail_commit:
@@ -2253,10 +2262,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	down_write(&root->snap_sem);
-	freeze_bdev(root->fs_info->sb->s_bdev);
-	thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
@@ -2264,6 +2269,9 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
+	err = btrfs_commit_transaction(trans, root);
+
+	trans = btrfs_start_transaction(root, 1);
 
 	ret = btrfs_update_inode(trans, root, root->inode);
 	if (ret)
@@ -2272,9 +2280,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
 	if (ret)
-		goto fail;
-
-	memcpy(&new_root_item, &root->root_item,
+		goto fail; memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
 	key.objectid = objectid;
@@ -2285,12 +2291,20 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	free_extent_buffer(tmp);
 
+	/* write the ordered inodes to force all delayed allocations to
+	 * be filled.  Once this is done, we can copy the root
+	 */
+	mutex_lock(&root->fs_info->trans_mutex);
+	btrfs_write_ordered_inodes(trans, root);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
 	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
 
 	btrfs_set_root_bytenr(&new_root_item, tmp->start);
 	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
+printk("new root %Lu node %Lu\n", objectid, tmp->start);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
@@ -2321,7 +2335,6 @@ fail:
 		ret = err;
 fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2608,6 +2621,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->ordered_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..411aba84d305
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+
+struct tree_entry {
+	u64 root_objectid;
+	u64 objectid;
+	struct rb_node rb_node;
+};
+
+/*
+ * returns > 0 if entry passed (root, objectid) is > entry,
+ * < 0 if (root, objectid) < entry and zero if they are equal
+ */
+static int comp_entry(struct tree_entry *entry, u64 root_objectid,
+		      u64 objectid)
+{
+	if (root_objectid < entry->root_objectid)
+		return -1;
+	if (root_objectid > entry->root_objectid)
+		return 1;
+	if (objectid < entry->objectid)
+		return -1;
+	if (objectid > entry->objectid)
+		return 1;
+	return 0;
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
+				   u64 objectid, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+	int comp;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		comp = comp_entry(entry, root_objectid, objectid);
+		if (comp < 0)
+			p = &(*p)->rb_left;
+		else if (comp > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
+				     u64 objectid, struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+	int comp;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+		comp = comp_entry(entry, root_objectid, objectid);
+
+		if (comp < 0)
+			n = n->rb_left;
+		else if (comp > 0)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+static inline struct rb_node *tree_search(struct rb_root *root,
+					  u64 root_objectid, u64 objectid)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, root_objectid, objectid, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 root_objectid = root->root_key.objectid;
+	u64 transid = root->fs_info->running_transaction->transid;
+	struct tree_entry *entry;
+	struct rb_node *node;
+	struct btrfs_ordered_inode_tree *tree;
+
+	if (transid <= BTRFS_I(inode)->ordered_trans)
+		return 0;
+
+	tree = &root->fs_info->running_transaction->ordered_inode_tree;
+
+	read_lock(&tree->lock);
+	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
+	read_unlock(&tree->lock);
+	if (node) {
+		return 0;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	write_lock(&tree->lock);
+	entry->objectid = inode->i_ino;
+	entry->root_objectid = root_objectid;
+
+	node = tree_insert(&tree->tree, root_objectid,
+			   inode->i_ino, &entry->rb_node);
+
+	BTRFS_I(inode)->ordered_trans = transid;
+
+	write_unlock(&tree->lock);
+	if (node)
+		kfree(entry);
+	return 0;
+}
+
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+	entry = rb_entry(node, struct tree_entry, rb_node);
+
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	write_unlock(&tree->lock);
+	return 1;
+}
+
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	rb_erase(node, &tree->tree);
+	write_unlock(&tree->lock);
+	kfree(entry);
+	return 1;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..aaf9eb142719
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+struct btrfs_ordered_inode_tree {
+	rwlock_t lock;
+	struct rb_root tree;
+};
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+	rwlock_init(&t->lock);
+	t->tree.rb_node = NULL;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode);
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 02721eea9a7a..3ed5868e7c0f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,6 +67,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_map_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -473,6 +474,60 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct inode *inode;
+	u64 root_objectid = 0;
+	u64 objectid = 0;
+	u64 transid = trans->transid;
+	int ret;
+
+printk("write ordered trans %Lu\n", transid);
+	while(1) {
+		ret = btrfs_find_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_fdatawrite(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	while(1) {
+		root_objectid = 0;
+		objectid = 0;
+		ret = btrfs_find_del_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_write_and_wait(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+printk("done write ordered trans %Lu\n", transid);
+	return 0;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -550,10 +605,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
+		ret = btrfs_write_ordered_inodes(trans, root);
+
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
 	WARN_ON(cur_trans != trans->transaction);
+
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eef840bca91e..c157ddbe9d1e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,9 +16,10 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __TRANSACTION__
-#define __TRANSACTION__
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -30,6 +31,7 @@ struct btrfs_transaction {
 	struct list_head list;
 	struct extent_map_tree dirty_pages;
 	unsigned long start_time;
+	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
@@ -90,4 +92,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From b98b6767a0f6f9baa1d00eff64a43d58412ecdeb Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 8 Jan 2008 15:54:37 -0500
Subject: Btrfs: Add inode flags support

This patch adds NODATASUM & NODATACOW inode flags support.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       | 13 ++++++++++++-
 fs/btrfs/inode.c       | 24 +++++++++++-------------
 3 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f27e633f1742..9fc99883c70a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,7 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	u32 flags;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f0fb1978553c..7eda51542d17 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -447,7 +447,18 @@ struct btrfs_root {
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
-
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM 		0x1
+#define BTRFS_INODE_NODATACOW 		0x2
+
+#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
+					 ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
+					 BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
+					 BTRFS_INODE_##flag)
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 281757007ddb..fba0478a95f4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -243,9 +243,9 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 num_bytes;
 	int ret;
-
 	mutex_lock(&root->fs_info->fs_mutex);
-	if (btrfs_test_opt(root, NODATACOW))
+	if (btrfs_test_opt(root, NODATACOW) ||
+	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, start, end);
 	else
 		ret = cow_file_range(inode, start, end);
@@ -273,10 +273,9 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	int ret = 0;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	size_t offset = start - page_start;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
@@ -299,10 +298,9 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
 	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
@@ -335,10 +333,9 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
 	unsigned long flags;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	ret = get_state_private(em_tree, start, &private);
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
@@ -415,7 +412,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
-
+	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 	if (!BTRFS_I(inode)->block_group) {
 		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
 						         NULL, 0, 0, 0);
@@ -484,6 +481,7 @@ static void fill_inode_item(struct extent_buffer *leaf,
 	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
 	btrfs_set_inode_generation(leaf, item, inode->i_generation);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
 	btrfs_set_inode_block_group(leaf, item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
@@ -1454,7 +1452,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 1;
 	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
-
+	BTRFS_I(inode)->flags = 0;
 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
 	if (ret)
 		goto fail;
-- 
cgit v1.2.3


From d1310b2e0cd98eb1348553e69b73827b436dca7b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jan 2008 16:13:08 -0500
Subject: Btrfs: Split the extent_map code into two parts

There is now extent_map for mapping offsets in the file to disk and
extent_io for state tracking, IO submission and extent_bufers.

The new extent_map code shifts from [start,end] pairs to [start,len], and
pushes the locking out into the caller.  This allows a few performance
optimizations and is easier to use.

A number of extent_map usage bugs were fixed, mostly with failing
to remove extent_map entries when changing the file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    2 +
 fs/btrfs/ctree.h       |   15 +-
 fs/btrfs/disk-io.c     |  123 +-
 fs/btrfs/extent-tree.c |   43 +-
 fs/btrfs/extent_io.c   | 3089 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h   |  193 +++
 fs/btrfs/extent_map.c  | 3119 ++----------------------------------------------
 fs/btrfs/extent_map.h  |  214 +---
 fs/btrfs/file.c        |   29 +-
 fs/btrfs/inode.c       |  175 +--
 fs/btrfs/super.c       |   10 +-
 fs/btrfs/transaction.c |    8 +-
 fs/btrfs/transaction.h |    2 +-
 14 files changed, 3612 insertions(+), 3413 deletions(-)
 create mode 100644 fs/btrfs/extent_io.c
 create mode 100644 fs/btrfs/extent_io.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 72cc3e16741f..9c61609f3714 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,8 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+	   extent_io.o
 
 ifeq ($(CONFIG_FS_POSIX_ACL),y)
 btrfs-y += acl.o
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9fc99883c70a..9a0647dc5a0b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,6 +20,7 @@
 #define __BTRFS_I__
 
 #include "extent_map.h"
+#include "extent_io.h"
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -27,6 +28,7 @@ struct btrfs_inode {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct extent_map_tree extent_tree;
+	struct extent_io_tree io_tree;
 	struct inode vfs_inode;
 
 	u64 ordered_trans;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9f2975b55c9..6c65473e0fe3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
+#include "extent_io.h"
 #include "extent_map.h"
 
 struct btrfs_trans_handle;
@@ -314,11 +315,11 @@ struct btrfs_fs_info {
 	struct btrfs_root *tree_root;
 	struct radix_tree_root fs_roots_radix;
 
-	struct extent_map_tree free_space_cache;
-	struct extent_map_tree block_group_cache;
-	struct extent_map_tree pinned_extents;
-	struct extent_map_tree pending_del;
-	struct extent_map_tree extent_ins;
+	struct extent_io_tree free_space_cache;
+	struct extent_io_tree block_group_cache;
+	struct extent_io_tree pinned_extents;
+	struct extent_io_tree pending_del;
+	struct extent_io_tree extent_ins;
 
 	u64 generation;
 	u64 last_trans_committed;
@@ -956,7 +957,7 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 				  u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr);
@@ -1001,7 +1002,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      u64 owner_objectid, u64 owner_offset, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct extent_map_tree *unpin);
+			       struct extent_io_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5d1f9bca2712..4c4ebea0b2a9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,14 +43,14 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 }
 #endif
 
-static struct extent_map_ops btree_extent_map_ops;
+static struct extent_io_ops btree_extent_io_ops;
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
-	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 				bytenr, blocksize, GFP_NOFS);
 	return eb;
 }
@@ -61,13 +61,13 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
 
-	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 				 bytenr, blocksize, NULL, GFP_NOFS);
 	return eb;
 }
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 end,
+				    size_t page_offset, u64 start, u64 len,
 				    int create)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -75,7 +75,9 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	int ret;
 
 again:
-	em = lookup_extent_mapping(em_tree, start, end);
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	spin_unlock(&em_tree->lock);
 	if (em) {
 		goto out;
 	}
@@ -85,11 +87,14 @@ again:
 		goto out;
 	}
 	em->start = 0;
-	em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1;
+	em->len = i_size_read(inode);
 	em->block_start = 0;
-	em->block_end = em->end;
 	em->bdev = inode->i_sb->s_bdev;
+
+	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
 	if (ret == -EEXIST) {
 		free_extent_map(em);
 		em = NULL;
@@ -175,13 +180,13 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
 	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 	if (page->private == EXTENT_PAGE_PRIVATE)
 		goto out;
@@ -230,16 +235,16 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_write_full_page(tree, page, btree_get_extent, wbc);
 }
 
 static int btree_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
@@ -264,18 +269,20 @@ static int btree_writepages(struct address_space *mapping,
 
 int btree_readpage(struct file *file, struct page *page)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_read_full_page(tree, page, btree_get_extent);
 }
 
 static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
+	struct extent_map_tree *map;
 	int ret;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(tree, page);
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	map = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(map, tree, page);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
@@ -286,8 +293,8 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 static void btree_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 }
@@ -331,7 +338,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return 0;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 				 buf, 0, 0);
 	free_extent_buffer(buf);
 	return ret;
@@ -342,40 +349,39 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_map_tree *extent_tree;
+	struct extent_io_tree *io_tree;
 	u64 end;
 	int ret;
 
-	extent_tree = &BTRFS_I(btree_inode)->extent_tree;
+	io_tree = &BTRFS_I(btree_inode)->io_tree;
 
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
-				 buf, 0, 1);
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1);
 
 	if (buf->flags & EXTENT_CSUM)
 		return buf;
 
 	end = buf->start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
+	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
 		return buf;
 	}
 
-	lock_extent(extent_tree, buf->start, end, GFP_NOFS);
+	lock_extent(io_tree, buf->start, end, GFP_NOFS);
 
-	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
+	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
 		goto out_unlock;
 	}
 
 	ret = csum_tree_block(root, buf, 1);
-	set_extent_bits(extent_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
+	set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
 
 out_unlock:
-	unlock_extent(extent_tree, buf->start, end, GFP_NOFS);
+	unlock_extent(io_tree, buf->start, end, GFP_NOFS);
 	return buf;
 }
 
@@ -385,7 +391,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid)
-		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree,
+		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	return 0;
 }
@@ -394,7 +400,7 @@ int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree,
+	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree,
 					buf);
 	return 0;
 }
@@ -659,20 +665,23 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
-	BTRFS_I(fs_info->btree_inode)->extent_tree.ops = &btree_extent_map_ops;
+	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+			     GFP_NOFS);
+
+	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
-	extent_map_tree_init(&fs_info->free_space_cache,
+	extent_io_tree_init(&fs_info->free_space_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->block_group_cache,
+	extent_io_tree_init(&fs_info->block_group_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->pinned_extents,
+	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->pending_del,
+	extent_io_tree_init(&fs_info->pending_del,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->extent_ins,
+	extent_io_tree_init(&fs_info->extent_ins,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
@@ -787,7 +796,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super);
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
 	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
 				     super->start, super->len);
 	if (!btrfs_test_opt(root, NOBARRIER))
@@ -864,12 +873,12 @@ int close_ctree(struct btrfs_root *root)
 
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
-	extent_map_tree_empty_lru(&fs_info->free_space_cache);
-	extent_map_tree_empty_lru(&fs_info->block_group_cache);
-	extent_map_tree_empty_lru(&fs_info->pinned_extents);
-	extent_map_tree_empty_lru(&fs_info->pending_del);
-	extent_map_tree_empty_lru(&fs_info->extent_ins);
-	extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+	extent_io_tree_empty_lru(&fs_info->free_space_cache);
+	extent_io_tree_empty_lru(&fs_info->block_group_cache);
+	extent_io_tree_empty_lru(&fs_info->pinned_extents);
+	extent_io_tree_empty_lru(&fs_info->pending_del);
+	extent_io_tree_empty_lru(&fs_info->extent_ins);
+	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
@@ -892,13 +901,13 @@ int close_ctree(struct btrfs_root *root)
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
+	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
+	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 }
 
@@ -914,7 +923,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			transid, root->fs_info->generation);
 		WARN_ON(1);
 	}
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
 }
 
 void btrfs_throttle(struct btrfs_root *root)
@@ -941,7 +950,7 @@ void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
 }
 
@@ -949,7 +958,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
 			GFP_NOFS);
 }
@@ -958,7 +967,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
 }
 
@@ -966,7 +975,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG_DONE, 0);
 }
@@ -975,7 +984,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG_DONE, GFP_NOFS);
 }
@@ -984,7 +993,7 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG, GFP_NOFS);
 }
@@ -993,10 +1002,10 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 					buf, 0, 1);
 }
 
-static struct extent_map_ops btree_extent_map_ops = {
+static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b69a46691a96..1cf125ab7822 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -63,7 +63,7 @@ static int cache_block_group(struct btrfs_root *root,
 	int ret;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct extent_map_tree *free_space_cache;
+	struct extent_io_tree *free_space_cache;
 	int slot;
 	u64 last = 0;
 	u64 hole_size;
@@ -158,7 +158,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *block_group = NULL;
 	u64 ptr;
 	u64 start;
@@ -281,7 +281,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
@@ -951,7 +951,7 @@ fail:
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *cache;
 	int ret;
 	int err = 0;
@@ -1107,12 +1107,12 @@ static int update_pinned_extents(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 {
 	u64 last = 0;
 	u64 start;
 	u64 end;
-	struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
+	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
 	while(1) {
@@ -1128,12 +1128,12 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
 
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct extent_map_tree *unpin)
+			       struct extent_io_tree *unpin)
 {
 	u64 start;
 	u64 end;
 	int ret;
-	struct extent_map_tree *free_space_cache;
+	struct extent_io_tree *free_space_cache;
 	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
@@ -1329,8 +1329,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int err = 0;
 	u64 start;
 	u64 end;
-	struct extent_map_tree *pending_del;
-	struct extent_map_tree *pinned_extents;
+	struct extent_io_tree *pending_del;
+	struct extent_io_tree *pinned_extents;
 
 	pending_del = &extent_root->fs_info->pending_del;
 	pinned_extents = &extent_root->fs_info->pinned_extents;
@@ -1802,7 +1802,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
-	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree,
+	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree,
 			buf->start, buf->start + buf->len - 1,
 			EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
@@ -2166,7 +2166,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	unsigned long i;
 	struct page *page;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2195,15 +2195,14 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 		delalloc_start = page_start;
-		existing_delalloc =
-			count_range_bits(&BTRFS_I(inode)->extent_tree,
-					 &delalloc_start, page_end,
-					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
+		existing_delalloc = count_range_bits(io_tree,
+					     &delalloc_start, page_end,
+					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 
-		set_extent_delalloc(em_tree, page_start,
+		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -2211,7 +2210,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 						 existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
 
-		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2379,7 +2378,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	u64 cur_byte;
 	u64 total_found;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -2561,7 +2560,7 @@ int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_block_group_item *item;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int ret;
@@ -2645,7 +2644,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..15cc158a0498
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3089 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/version.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "extent_io.h"
+#include "extent_map.h"
+
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+				       unsigned long extra_flags,
+				       void (*ctor)(void *, struct kmem_cache *,
+						    unsigned long));
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+	u64 start;
+	u64 end;
+	int in_tree;
+	struct rb_node rb_node;
+};
+
+struct extent_page_data {
+	struct bio *bio;
+	struct extent_io_tree *tree;
+	get_extent_t *get_extent;
+};
+
+int __init extent_io_init(void)
+{
+	extent_state_cache = btrfs_cache_create("extent_state",
+					    sizeof(struct extent_state), 0,
+					    NULL);
+	if (!extent_state_cache)
+		return -ENOMEM;
+
+	extent_buffer_cache = btrfs_cache_create("extent_buffers",
+					    sizeof(struct extent_buffer), 0,
+					    NULL);
+	if (!extent_buffer_cache)
+		goto free_state_cache;
+	return 0;
+
+free_state_cache:
+	kmem_cache_destroy(extent_state_cache);
+	return -ENOMEM;
+}
+
+void extent_io_exit(void)
+{
+	struct extent_state *state;
+
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, list);
+		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
+		list_del(&state->list);
+		kmem_cache_free(extent_state_cache, state);
+
+	}
+
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+	if (extent_buffer_cache)
+		kmem_cache_destroy(extent_buffer_cache);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{
+	tree->state.rb_node = NULL;
+	tree->ops = NULL;
+	tree->dirty_bytes = 0;
+	rwlock_init(&tree->lock);
+	spin_lock_init(&tree->lru_lock);
+	tree->mapping = mapping;
+	INIT_LIST_HEAD(&tree->buffer_lru);
+	tree->lru_size = 0;
+}
+EXPORT_SYMBOL(extent_io_tree_init);
+
+void extent_io_tree_empty_lru(struct extent_io_tree *tree)
+{
+	struct extent_buffer *eb;
+	while(!list_empty(&tree->buffer_lru)) {
+		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
+				lru);
+		list_del_init(&eb->lru);
+		free_extent_buffer(eb);
+	}
+}
+EXPORT_SYMBOL(extent_io_tree_empty_lru);
+
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+	unsigned long flags;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state || IS_ERR(state))
+		return state;
+	state->state = 0;
+	state->in_tree = 0;
+	state->private = 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &states);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	return state;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+void free_extent_state(struct extent_state *state)
+{
+	unsigned long flags;
+	if (!state)
+		return;
+	if (atomic_dec_and_test(&state->refs)) {
+		WARN_ON(state->in_tree);
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+EXPORT_SYMBOL(free_extent_state);
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset < entry->start)
+			p = &(*p)->rb_left;
+		else if (offset > entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				     struct rb_node **prev_ret,
+				     struct rb_node **next_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *orig_prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset > entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+
+	if (prev_ret) {
+		orig_prev = prev;
+		while(prev && offset > prev_entry->end) {
+			prev = rb_next(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*prev_ret = prev;
+		prev = orig_prev;
+	}
+
+	if (next_ret) {
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		while(prev && offset < prev_entry->start) {
+			prev = rb_prev(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*next_ret = prev;
+	}
+	return NULL;
+}
+
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev, NULL);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+/*
+ * utility function to look for merge candidates inside a given range.
+ * Any extents with matching state are merged together into a single
+ * extent in the tree.  Extents with EXTENT_IO in their state field
+ * are not merged because the end_io handlers need to be able to do
+ * operations on them without sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_io_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *other;
+	struct rb_node *other_node;
+
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	other_node = rb_prev(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->end == state->start - 1 &&
+		    other->state == state->state) {
+			state->start = other->start;
+			other->in_tree = 0;
+			rb_erase(&other->rb_node, &tree->state);
+			free_extent_state(other);
+		}
+	}
+	other_node = rb_next(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->start == state->end + 1 &&
+		    other->state == state->state) {
+			other->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct rb_node *node;
+
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+	if (bits & EXTENT_DIRTY)
+		tree->dirty_bytes += end - start + 1;
+	state->state |= bits;
+	state->start = start;
+	state->end = end;
+	node = tree_insert(&tree->state, end, &state->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		free_extent_state(state);
+		return -EEXIST;
+	}
+	merge_state(tree, state);
+	return 0;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end].  After calling, there
+ * are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *node;
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	prealloc->state = orig->state;
+	orig->start = split;
+
+	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+		free_extent_state(prealloc);
+		return -EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+			    struct extent_state *state, int bits, int wake,
+			    int delete)
+{
+	int ret = state->state & bits;
+
+	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		WARN_ON(range > tree->dirty_bytes);
+		tree->dirty_bytes -= range;
+	}
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+	if (delete || state->state == 0) {
+		if (state->in_tree) {
+			rb_erase(&state->rb_node, &tree->state);
+			state->in_tree = 0;
+			free_extent_state(state);
+		} else {
+			WARN_ON(1);
+		}
+	} else {
+		merge_state(tree, state);
+	}
+	return ret;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns < 0 on error, > 0 if any of the
+ * bits were already set, or zero if none of the bits were already set.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	unsigned long flags;
+	int err;
+	int set = 0;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irqsave(&tree->lock, flags);
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+
+	if (state->start < start) {
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			start = state->end + 1;
+			set |= clear_state_bit(tree, state, bits,
+					wake, delete);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		if (wake)
+			wake_up(&state->wq);
+		set |= clear_state_bit(tree, prealloc, bits,
+				       wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	start = state->end + 1;
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	goto search_again;
+
+out:
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return set;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+static int wait_on_state(struct extent_io_tree *tree,
+			 struct extent_state *state)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+	read_unlock_irq(&tree->lock);
+	schedule();
+	read_lock_irq(&tree->lock);
+	finish_wait(&state->wq, &wait);
+	return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function
+ */
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+again:
+	while (1) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (state->start > end)
+			goto out;
+
+		if (state->state & bits) {
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			goto again;
+		}
+		start = state->end + 1;
+
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+out:
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   int bits)
+{
+	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		tree->dirty_bytes += range;
+	}
+	state->state |= bits;
+}
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ */
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	unsigned long flags;
+	int err = 0;
+	int set;
+	u64 last_start;
+	u64 last_end;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irqsave(&tree->lock, flags);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set = state->state & bits;
+		if (set && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		set_state_bits(tree, state, bits);
+		start = state->end + 1;
+		merge_state(tree, state);
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, bits);
+			start = state->end + 1;
+			merge_state(tree, state);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start -1;
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		set_state_bits(tree, prealloc, bits);
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, bits, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_bits);
+
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_bits);
+
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_delalloc);
+
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end,
+				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
+			      0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * locks a range in ascending order, waiting for any locked regions
+ * it hits on the way.  [start,end] are inclusive, and this will sleep.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+	while (1) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		if (err == -EEXIST && (mask & __GFP_WAIT)) {
+			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+			start = failed_start;
+		} else {
+			break;
+		}
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * helper function to set pages and extents in the tree dirty
+ */
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * helper function to set both pages and extents in the tree writeback
+ */
+int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		page_cache_release(page);
+		index++;
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 1;
+
+	read_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->end >= start && (state->state & bits)) {
+			*start_ret = state->start;
+			*end_ret = state->end;
+			ret = 0;
+			break;
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	read_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(find_first_extent_bit);
+
+u64 find_lock_delalloc_range(struct extent_io_tree *tree,
+			     u64 *start, u64 *end, u64 max_bytes)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 found = 0;
+	u64 total_bytes = 0;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+search_again:
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node)) {
+		*end = (u64)-1;
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (found && state->start != cur_start) {
+			goto out;
+		}
+		if (!(state->state & EXTENT_DELALLOC)) {
+			if (!found)
+				*end = state->end;
+			goto out;
+		}
+		if (!found) {
+			struct extent_state *prev_state;
+			struct rb_node *prev_node = node;
+			while(1) {
+				prev_node = rb_prev(prev_node);
+				if (!prev_node)
+					break;
+				prev_state = rb_entry(prev_node,
+						      struct extent_state,
+						      rb_node);
+				if (!(prev_state->state & EXTENT_DELALLOC))
+					break;
+				state = prev_state;
+				node = prev_node;
+			}
+		}
+		if (state->state & EXTENT_LOCKED) {
+			DEFINE_WAIT(wait);
+			atomic_inc(&state->refs);
+			prepare_to_wait(&state->wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			write_unlock_irq(&tree->lock);
+			schedule();
+			write_lock_irq(&tree->lock);
+			finish_wait(&state->wq, &wait);
+			free_extent_state(state);
+			goto search_again;
+		}
+		state->state |= EXTENT_LOCKED;
+		if (!found)
+			*start = state->start;
+		found++;
+		*end = state->end;
+		cur_start = state->end + 1;
+		node = rb_next(node);
+		if (!node)
+			break;
+		total_bytes += state->end - state->start + 1;
+		if (total_bytes >= max_bytes)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return found;
+}
+
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end, u64 max_bytes,
+		     unsigned long bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 total_bytes = 0;
+	int found = 0;
+
+	if (search_end <= cur_start) {
+		printk("search_end %Lu start %Lu\n", search_end, cur_start);
+		WARN_ON(1);
+		return 0;
+	}
+
+	write_lock_irq(&tree->lock);
+	if (cur_start == 0 && bits == EXTENT_DIRTY) {
+		total_bytes = tree->dirty_bytes;
+		goto out;
+	}
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > search_end)
+			break;
+		if (state->end >= cur_start && (state->state & bits)) {
+			total_bytes += min(search_end, state->end) + 1 -
+				       max(cur_start, state->start);
+			if (total_bytes >= max_bytes)
+				break;
+			if (!found) {
+				*start = state->start;
+				found = 1;
+			}
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return total_bytes;
+}
+/*
+ * helper function to lock both pages and extents in the tree.
+ * pages must be locked first.
+ */
+int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int err;
+
+	while (index <= end_index) {
+		page = grab_cache_page(tree->mapping, index);
+		if (!page) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed;
+		}
+		index++;
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+failed:
+	/*
+	 * we failed above in getting the page at 'index', so we undo here
+	 * up to but not including the page at 'index'
+	 */
+	end_index = index;
+	index = start >> PAGE_CACHE_SHIFT;
+	while (index < end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * helper function to unlock both pages and extents in the tree.
+ */
+int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(tree->mapping, index);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state->private = private;
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	read_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*private = state->private;
+out:
+	read_unlock_irq(&tree->lock);
+	return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if ever extent in the tree
+ * has the bits set.  Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+	unsigned long flags;
+
+	read_lock_irqsave(&tree->lock, flags);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+		if (!node) {
+			if (filled)
+				bitset = 0;
+			break;
+		}
+	}
+	read_unlock_irqrestore(&tree->lock, flags);
+	return bitset;
+}
+EXPORT_SYMBOL(test_range_bit);
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ */
+static int check_page_uptodate(struct extent_io_tree *tree,
+			       struct page *page)
+{
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static int check_page_locked(struct extent_io_tree *tree,
+			     struct page *page)
+{
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ */
+static int check_page_writeback(struct extent_io_tree *tree,
+			     struct page *page)
+{
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_writepage(struct bio *bio, int err)
+#else
+static int end_bio_extent_writepage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			 bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start, end);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_readpage(struct bio *bio, int err)
+#else
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+	int ret;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+			ret = tree->ops->readpage_end_io_hook(page, start, end);
+			if (ret)
+				uptodate = 0;
+		}
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+#else
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+#endif
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+		 gfp_t gfp_flags)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_sector;
+	}
+	return bio;
+}
+
+static int submit_one_bio(int rw, struct bio *bio)
+{
+	u64 maxsector;
+	int ret = 0;
+
+	bio_get(bio);
+
+        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+	if (maxsector < bio->bi_sector) {
+		printk("sector too large max %Lu got %llu\n", maxsector,
+			(unsigned long long)bio->bi_sector);
+		WARN_ON(1);
+	}
+
+	submit_bio(rw, bio);
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	bio_put(bio);
+	return ret;
+}
+
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      struct bio **bio_ret,
+			      unsigned long max_pages,
+			      bio_end_io_t end_io_func)
+{
+	int ret = 0;
+	struct bio *bio;
+	int nr;
+
+	if (bio_ret && *bio_ret) {
+		bio = *bio_ret;
+		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		    bio_add_page(bio, page, size, offset) < size) {
+			ret = submit_one_bio(rw, bio);
+			bio = NULL;
+		} else {
+			return 0;
+		}
+	}
+	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
+	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio) {
+		printk("failed to allocate bio nr %d\n", nr);
+	}
+	bio_add_page(bio, page, size, offset);
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+	if (bio_ret) {
+		*bio_ret = bio;
+	} else {
+		ret = submit_one_bio(rw, bio);
+	}
+
+	return ret;
+}
+
+void set_page_extent_mapped(struct page *page)
+{
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
+		set_page_private(page, EXTENT_PAGE_PRIVATE);
+		page_cache_get(page);
+	}
+}
+
+void set_page_extent_head(struct page *page, unsigned long len)
+{
+	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+}
+
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ */
+static int __extent_read_full_page(struct extent_io_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio)
+{
+	struct inode *inode = page->mapping->host;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	set_page_extent_mapped(page);
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			char *userpage;
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur,
+				end - cur + 1, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(extent_map_end(em) <= cur);
+		BUG_ON(end < cur);
+
+		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		cur_end = min(extent_map_end(em) - 1, end);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == EXTENT_MAP_HOLE) {
+			char *userpage;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = 0;
+		if (tree->ops && tree->ops->readpage_io_hook) {
+			ret = tree->ops->readpage_io_hook(page, cur,
+							  cur + iosize - 1);
+		}
+		if (!ret) {
+			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+			nr -= page->index;
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset,
+					 bdev, bio, nr,
+					 end_bio_extent_readpage);
+		}
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			    get_extent_t *get_extent)
+{
+	struct bio *bio = NULL;
+	int ret;
+
+	ret = __extent_read_full_page(tree, page, get_extent, &bio);
+	if (bio)
+		submit_one_bio(READ, bio);
+	return ret;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+			      void *data)
+{
+	struct inode *inode = page->mapping->host;
+	struct extent_page_data *epd = data;
+	struct extent_io_tree *tree = epd->tree;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 delalloc_start;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 iosize;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	u64 nr_delalloc;
+	u64 delalloc_end;
+
+	WARN_ON(!PageLocked(page));
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (page->index == end_index) {
+		char *userpage;
+
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+
+		userpage = kmap_atomic(page, KM_USER0);
+		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap_atomic(userpage, KM_USER0);
+	}
+
+	set_page_extent_mapped(page);
+
+	delalloc_start = start;
+	delalloc_end = 0;
+	while(delalloc_end < page_end) {
+		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+						       &delalloc_end,
+						       128 * 1024 * 1024);
+		if (nr_delalloc == 0) {
+			delalloc_start = delalloc_end + 1;
+			continue;
+		}
+		tree->ops->fill_delalloc(inode, delalloc_start,
+					 delalloc_end);
+		clear_extent_bit(tree, delalloc_start,
+				 delalloc_end,
+				 EXTENT_LOCKED | EXTENT_DELALLOC,
+				 1, 0, GFP_NOFS);
+		delalloc_start = delalloc_end + 1;
+	}
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	end = page_end;
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
+		printk("found delalloc bits after lock_extent\n");
+	}
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = epd->get_extent(inode, page, page_offset, cur,
+				     end - cur + 1, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(extent_map_end(em) <= cur);
+		BUG_ON(end < cur);
+		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		if (block_start == EXTENT_MAP_HOLE ||
+		    block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+				   EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		if (tree->ops && tree->ops->writepage_io_hook) {
+			ret = tree->ops->writepage_io_hook(page, cur,
+						cur + iosize - 1);
+		} else {
+			ret = 0;
+		}
+		if (ret)
+			SetPageError(page);
+		else {
+			unsigned long max_nr = end_index + 1;
+			set_range_writeback(tree, cur, cur + iosize - 1);
+			if (!PageWriteback(page)) {
+				printk("warning page %lu not writeback, "
+				       "cur %llu end %llu\n", page->index,
+				       (unsigned long long)cur,
+				       (unsigned long long)end);
+			}
+
+			ret = submit_extent_page(WRITE, tree, page, sector,
+						 iosize, page_offset, bdev,
+						 &epd->bio, max_nr,
+						 end_bio_extent_writepage);
+			if (ret)
+				SetPageError(page);
+		}
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	if (nr == 0) {
+		/* make sure the mapping tag for page dirty gets cleared */
+		set_page_writeback(page);
+		end_page_writeback(page);
+	}
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+
+/* Taken directly from 2.6.23 for 2.6.18 back port */
+typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+                                void *data);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space
+ * and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int write_cache_pages(struct address_space *mapping,
+		      struct writeback_control *wbc, writepage_t writepage,
+		      void *data)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int range_whole = 0;
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+retry:
+	while (!done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (PageWriteback(page) ||
+			    !clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			ret = (*writepage)(page, wbc, data);
+
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+				unlock_page(page);
+				ret = 0;
+			}
+			if (ret || (--(wbc->nr_to_write) <= 0))
+				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+#endif
+
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	int ret;
+	struct address_space *mapping = page->mapping;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+	struct writeback_control wbc_writepages = {
+		.bdi		= wbc->bdi,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 64,
+		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
+		.range_end	= (loff_t)-1,
+	};
+
+
+	ret = __extent_writepage(page, wbc, &epd);
+
+	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
+	if (epd.bio) {
+		submit_one_bio(WRITE, epd.bio);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc)
+{
+	int ret = 0;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+
+	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+	if (epd.bio) {
+		submit_one_bio(WRITE, epd.bio);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(extent_writepages);
+
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	struct pagevec pvec;
+
+	pagevec_init(&pvec, 0);
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		/*
+		 * what we want to do here is call add_to_page_cache_lru,
+		 * but that isn't exported, so we reproduce it here
+		 */
+		if (!add_to_page_cache(page, mapping,
+					page->index, GFP_KERNEL)) {
+
+			/* open coding of lru_cache_add, also not exported */
+			page_cache_get(page);
+			if (!pagevec_add(&pvec, page))
+				__pagevec_lru_add(&pvec);
+			__extent_read_full_page(tree, page, get_extent, &bio);
+		}
+		page_cache_release(page);
+	}
+	if (pagevec_count(&pvec))
+		__pagevec_lru_add(&pvec);
+	BUG_ON(!list_empty(pages));
+	if (bio)
+		submit_one_bio(READ, bio);
+	return 0;
+}
+EXPORT_SYMBOL(extent_readpages);
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+	start += (offset + blocksize -1) & ~(blocksize - 1);
+	if (start > end)
+		return 0;
+
+	lock_extent(tree, start, end, GFP_NOFS);
+	wait_on_extent_writeback(tree, start, end);
+	clear_extent_bit(tree, start, end,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+			 1, 1, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(extent_invalidatepage);
+
+/*
+ * simple commit_write call, set_range_dirty is used to mark both
+ * the pages and the extent records as dirty
+ */
+int extent_commit_write(struct extent_io_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to)
+{
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	set_page_extent_mapped(page);
+	set_page_dirty(page);
+
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_commit_write);
+
+int extent_prepare_write(struct extent_io_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent)
+{
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 block_start;
+	u64 orig_block_start;
+	u64 block_end;
+	u64 cur_end;
+	struct extent_map *em;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	size_t page_offset = 0;
+	size_t block_off_start;
+	size_t block_off_end;
+	int err = 0;
+	int iocount = 0;
+	int ret = 0;
+	int isnew;
+
+	set_page_extent_mapped(page);
+
+	block_start = (page_start + from) & ~((u64)blocksize - 1);
+	block_end = (page_start + to - 1) | (blocksize - 1);
+	orig_block_start = block_start;
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	while(block_start <= block_end) {
+		em = get_extent(inode, page, page_offset, block_start,
+				block_end - block_start + 1, 1);
+		if (IS_ERR(em) || !em) {
+			goto err;
+		}
+		cur_end = min(block_end, extent_map_end(em) - 1);
+		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+		block_off_end = block_off_start + blocksize;
+		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+		if (!PageUptodate(page) && isnew &&
+		    (block_off_end > to || block_off_start < from)) {
+			void *kaddr;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_off_end > to)
+				memset(kaddr + to, 0, block_off_end - to);
+			if (block_off_start < from)
+				memset(kaddr + block_off_start, 0,
+				       from - block_off_start);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		if ((em->block_start != EXTENT_MAP_HOLE &&
+		     em->block_start != EXTENT_MAP_INLINE) &&
+		    !isnew && !PageUptodate(page) &&
+		    (block_off_end > to || block_off_start < from) &&
+		    !test_range_bit(tree, block_start, cur_end,
+				    EXTENT_UPTODATE, 1)) {
+			u64 sector;
+			u64 extent_offset = block_start - em->start;
+			size_t iosize;
+			sector = (em->block_start + extent_offset) >> 9;
+			iosize = (cur_end - block_start + blocksize) &
+				~((u64)blocksize - 1);
+			/*
+			 * we've already got the extent locked, but we
+			 * need to split the state such that our end_bio
+			 * handler can clear the lock.
+			 */
+			set_extent_bit(tree, block_start,
+				       block_start + iosize - 1,
+				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, em->bdev,
+					 NULL, 1,
+					 end_bio_extent_preparewrite);
+			iocount++;
+			block_start = block_start + iosize;
+		} else {
+			set_extent_uptodate(tree, block_start, cur_end,
+					    GFP_NOFS);
+			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			block_start = cur_end + 1;
+		}
+		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+		free_extent_map(em);
+	}
+	if (iocount) {
+		wait_extent_bit(tree, orig_block_start,
+				block_end, EXTENT_LOCKED);
+	}
+	check_page_uptodate(tree, page);
+err:
+	/* FIXME, zero out newly allocated blocks on error */
+	return err;
+}
+EXPORT_SYMBOL(extent_prepare_write);
+
+/*
+ * a helper for releasepage.  As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page)
+{
+	struct extent_map *em;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 orig_start = start;
+	int ret = 1;
+
+	while (start <= end) {
+		spin_lock(&map->lock);
+		em = lookup_extent_mapping(map, start, end);
+		if (!em || IS_ERR(em)) {
+			spin_unlock(&map->lock);
+			break;
+		}
+		if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
+				    EXTENT_LOCKED, 0)) {
+			remove_extent_mapping(map, em);
+			/* once for the rb tree */
+			free_extent_map(em);
+		}
+		start = extent_map_end(em);
+		spin_unlock(&map->lock);
+
+		/* once for us */
+		free_extent_map(em);
+	}
+	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
+		ret = 0;
+	else
+		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
+				 1, 1, GFP_NOFS);
+	return ret;
+}
+EXPORT_SYMBOL(try_release_extent_mapping);
+
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent)
+{
+	struct inode *inode = mapping->host;
+	u64 start = iblock << inode->i_blkbits;
+	sector_t sector = 0;
+	struct extent_map *em;
+
+	em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
+	if (!em || IS_ERR(em))
+		return 0;
+
+	if (em->block_start == EXTENT_MAP_INLINE ||
+	    em->block_start == EXTENT_MAP_HOLE)
+		goto out;
+
+	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start);
+out:
+	free_extent_map(em);
+	return sector;
+}
+
+static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
+{
+	if (list_empty(&eb->lru)) {
+		extent_buffer_get(eb);
+		list_add(&eb->lru, &tree->buffer_lru);
+		tree->lru_size++;
+		if (tree->lru_size >= BUFFER_LRU_MAX) {
+			struct extent_buffer *rm;
+			rm = list_entry(tree->buffer_lru.prev,
+					struct extent_buffer, lru);
+			tree->lru_size--;
+			list_del_init(&rm->lru);
+			free_extent_buffer(rm);
+		}
+	} else
+		list_move(&eb->lru, &tree->buffer_lru);
+	return 0;
+}
+static struct extent_buffer *find_lru(struct extent_io_tree *tree,
+				      u64 start, unsigned long len)
+{
+	struct list_head *lru = &tree->buffer_lru;
+	struct list_head *cur = lru->next;
+	struct extent_buffer *eb;
+
+	if (list_empty(lru))
+		return NULL;
+
+	do {
+		eb = list_entry(cur, struct extent_buffer, lru);
+		if (eb->start == start && eb->len == len) {
+			extent_buffer_get(eb);
+			return eb;
+		}
+		cur = cur->next;
+	} while (cur != lru);
+	return NULL;
+}
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
+{
+	struct page *p;
+	struct address_space *mapping;
+
+	if (i == 0)
+		return eb->first_page;
+	i += eb->start >> PAGE_CACHE_SHIFT;
+	mapping = eb->first_page->mapping;
+	read_lock_irq(&mapping->tree_lock);
+	p = radix_tree_lookup(&mapping->page_tree, i);
+	read_unlock_irq(&mapping->tree_lock);
+	return p;
+}
+
+static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
+						   u64 start,
+						   unsigned long len,
+						   gfp_t mask)
+{
+	struct extent_buffer *eb = NULL;
+
+	spin_lock(&tree->lru_lock);
+	eb = find_lru(tree, start, len);
+	spin_unlock(&tree->lru_lock);
+	if (eb) {
+		return eb;
+	}
+
+	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	INIT_LIST_HEAD(&eb->lru);
+	eb->start = start;
+	eb->len = len;
+	atomic_set(&eb->refs, 1);
+
+	return eb;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+	kmem_cache_free(extent_buffer_cache, eb);
+}
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+					  u64 start, unsigned long len,
+					  struct page *page0,
+					  gfp_t mask)
+{
+	unsigned long num_pages = num_extent_pages(start, len);
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct extent_buffer *eb;
+	struct page *p;
+	struct address_space *mapping = tree->mapping;
+	int uptodate = 1;
+
+	eb = __alloc_extent_buffer(tree, start, len, mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	if (eb->flags & EXTENT_BUFFER_FILLED)
+		goto lru_add;
+
+	if (page0) {
+		eb->first_page = page0;
+		i = 1;
+		index++;
+		page_cache_get(page0);
+		mark_page_accessed(page0);
+		set_page_extent_mapped(page0);
+		WARN_ON(!PageUptodate(page0));
+		set_page_extent_head(page0, len);
+	} else {
+		i = 0;
+	}
+	for (; i < num_pages; i++, index++) {
+		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+		if (!p) {
+			WARN_ON(1);
+			goto fail;
+		}
+		set_page_extent_mapped(p);
+		mark_page_accessed(p);
+		if (i == 0) {
+			eb->first_page = p;
+			set_page_extent_head(p, len);
+		} else {
+			set_page_private(p, EXTENT_PAGE_PRIVATE);
+		}
+		if (!PageUptodate(p))
+			uptodate = 0;
+		unlock_page(p);
+	}
+	if (uptodate)
+		eb->flags |= EXTENT_UPTODATE;
+	eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
+	return eb;
+
+fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+	for (index = 1; index < i; index++) {
+		page_cache_release(extent_buffer_page(eb, index));
+	}
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
+	__free_extent_buffer(eb);
+	return NULL;
+}
+EXPORT_SYMBOL(alloc_extent_buffer);
+
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 u64 start, unsigned long len,
+					  gfp_t mask)
+{
+	unsigned long num_pages = num_extent_pages(start, len);
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct extent_buffer *eb;
+	struct page *p;
+	struct address_space *mapping = tree->mapping;
+	int uptodate = 1;
+
+	eb = __alloc_extent_buffer(tree, start, len, mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	if (eb->flags & EXTENT_BUFFER_FILLED)
+		goto lru_add;
+
+	for (i = 0; i < num_pages; i++, index++) {
+		p = find_lock_page(mapping, index);
+		if (!p) {
+			goto fail;
+		}
+		set_page_extent_mapped(p);
+		mark_page_accessed(p);
+
+		if (i == 0) {
+			eb->first_page = p;
+			set_page_extent_head(p, len);
+		} else {
+			set_page_private(p, EXTENT_PAGE_PRIVATE);
+		}
+
+		if (!PageUptodate(p))
+			uptodate = 0;
+		unlock_page(p);
+	}
+	if (uptodate)
+		eb->flags |= EXTENT_UPTODATE;
+	eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
+	return eb;
+fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+	for (index = 1; index < i; index++) {
+		page_cache_release(extent_buffer_page(eb, index));
+	}
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
+	__free_extent_buffer(eb);
+	return NULL;
+}
+EXPORT_SYMBOL(find_extent_buffer);
+
+void free_extent_buffer(struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+
+	if (!eb)
+		return;
+
+	if (!atomic_dec_and_test(&eb->refs))
+		return;
+
+	WARN_ON(!list_empty(&eb->lru));
+	num_pages = num_extent_pages(eb->start, eb->len);
+
+	for (i = 1; i < num_pages; i++) {
+		page_cache_release(extent_buffer_page(eb, i));
+	}
+	page_cache_release(extent_buffer_page(eb, 0));
+	__free_extent_buffer(eb);
+}
+EXPORT_SYMBOL(free_extent_buffer);
+
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+			      struct extent_buffer *eb)
+{
+	int set;
+	unsigned long i;
+	unsigned long num_pages;
+	struct page *page;
+
+	u64 start = eb->start;
+	u64 end = start + eb->len - 1;
+
+	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
+	num_pages = num_extent_pages(eb->start, eb->len);
+
+	for (i = 0; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		lock_page(page);
+		if (i == 0)
+			set_page_extent_head(page, eb->len);
+		else
+			set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+		/*
+		 * if we're on the last page or the first page and the
+		 * block isn't aligned on a page boundary, do extra checks
+		 * to make sure we don't clean page that is partially dirty
+		 */
+		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+		    ((i == num_pages - 1) &&
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+			start = (u64)page->index << PAGE_CACHE_SHIFT;
+			end  = start + PAGE_CACHE_SIZE - 1;
+			if (test_range_bit(tree, start, end,
+					   EXTENT_DIRTY, 0)) {
+				unlock_page(page);
+				continue;
+			}
+		}
+		clear_page_dirty_for_io(page);
+		write_lock_irq(&page->mapping->tree_lock);
+		if (!PageDirty(page)) {
+			radix_tree_tag_clear(&page->mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&page->mapping->tree_lock);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(clear_extent_buffer_dirty);
+
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+				    struct extent_buffer *eb)
+{
+	return wait_on_extent_writeback(tree, eb->start,
+					eb->start + eb->len - 1);
+}
+EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
+
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = extent_buffer_page(eb, i);
+		/* writepage may need to do something special for the
+		 * first page, we have to make sure page->private is
+		 * properly set.  releasepage may drop page->private
+		 * on us if the page isn't already dirty.
+		 */
+		if (i == 0) {
+			lock_page(page);
+			set_page_extent_head(page, eb->len);
+		} else if (PagePrivate(page) &&
+			   page->private != EXTENT_PAGE_PRIVATE) {
+			lock_page(page);
+			set_page_extent_mapped(page);
+			unlock_page(page);
+		}
+		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+		if (i == 0)
+			unlock_page(page);
+	}
+	return set_extent_dirty(tree, eb->start,
+				eb->start + eb->len - 1, GFP_NOFS);
+}
+EXPORT_SYMBOL(set_extent_buffer_dirty);
+
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb)
+{
+	unsigned long i;
+	struct page *page;
+	unsigned long num_pages;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+
+	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+			    GFP_NOFS);
+	for (i = 0; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+		    ((i == num_pages - 1) &&
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+			check_page_uptodate(tree, page);
+			continue;
+		}
+		SetPageUptodate(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(set_extent_buffer_uptodate);
+
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+			     struct extent_buffer *eb)
+{
+	if (eb->flags & EXTENT_UPTODATE)
+		return 1;
+	return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+			   EXTENT_UPTODATE, 1);
+}
+EXPORT_SYMBOL(extent_buffer_uptodate);
+
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb,
+			     u64 start,
+			     int wait)
+{
+	unsigned long i;
+	unsigned long start_i;
+	struct page *page;
+	int err;
+	int ret = 0;
+	unsigned long num_pages;
+
+	if (eb->flags & EXTENT_UPTODATE)
+		return 0;
+
+	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+			   EXTENT_UPTODATE, 1)) {
+		return 0;
+	}
+
+	if (start) {
+		WARN_ON(start < eb->start);
+		start_i = (start >> PAGE_CACHE_SHIFT) -
+			(eb->start >> PAGE_CACHE_SHIFT);
+	} else {
+		start_i = 0;
+	}
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = start_i; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		if (PageUptodate(page)) {
+			continue;
+		}
+		if (!wait) {
+			if (TestSetPageLocked(page)) {
+				continue;
+			}
+		} else {
+			lock_page(page);
+		}
+		if (!PageUptodate(page)) {
+			err = page->mapping->a_ops->readpage(NULL, page);
+			if (err) {
+				ret = err;
+			}
+		} else {
+			unlock_page(page);
+		}
+	}
+
+	if (ret || !wait) {
+		return ret;
+	}
+
+	for (i = start_i; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		wait_on_page_locked(page);
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+		}
+	}
+	if (!ret)
+		eb->flags |= EXTENT_UPTODATE;
+	return ret;
+}
+EXPORT_SYMBOL(read_extent_buffer_pages);
+
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+			unsigned long start,
+			unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *dst = (char *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		if (!PageUptodate(page)) {
+			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
+			WARN_ON(1);
+		}
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = kmap_atomic(page, KM_USER1);
+		memcpy(dst, kaddr + offset, cur);
+		kunmap_atomic(kaddr, KM_USER1);
+
+		dst += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(read_extent_buffer);
+
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+			       unsigned long min_len, char **token, char **map,
+			       unsigned long *map_start,
+			       unsigned long *map_len, int km)
+{
+	size_t offset = start & (PAGE_CACHE_SIZE - 1);
+	char *kaddr;
+	struct page *p;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long end_i = (start_offset + start + min_len - 1) >>
+		PAGE_CACHE_SHIFT;
+
+	if (i != end_i)
+		return -EINVAL;
+
+	if (i == 0) {
+		offset = start_offset;
+		*map_start = 0;
+	} else {
+		offset = 0;
+		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+	}
+	if (start + min_len > eb->len) {
+printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
+		WARN_ON(1);
+	}
+
+	p = extent_buffer_page(eb, i);
+	WARN_ON(!PageUptodate(p));
+	kaddr = kmap_atomic(p, km);
+	*token = kaddr;
+	*map = kaddr + offset;
+	*map_len = PAGE_CACHE_SIZE - offset;
+	return 0;
+}
+EXPORT_SYMBOL(map_private_extent_buffer);
+
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+		      unsigned long min_len,
+		      char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km)
+{
+	int err;
+	int save = 0;
+	if (eb->map_token) {
+		unmap_extent_buffer(eb, eb->map_token, km);
+		eb->map_token = NULL;
+		save = 1;
+	}
+	err = map_private_extent_buffer(eb, start, min_len, token, map,
+				       map_start, map_len, km);
+	if (!err && save) {
+		eb->map_token = *token;
+		eb->kaddr = *map;
+		eb->map_start = *map_start;
+		eb->map_len = *map_len;
+	}
+	return err;
+}
+EXPORT_SYMBOL(map_extent_buffer);
+
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+	kunmap_atomic(token, km);
+}
+EXPORT_SYMBOL(unmap_extent_buffer);
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *ptr = (char *)ptrv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		ret = memcmp(ptr, kaddr + offset, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (ret)
+			break;
+
+		ptr += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(memcmp_extent_buffer);
+
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+			 unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *src = (char *)srcv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = kmap_atomic(page, KM_USER1);
+		memcpy(kaddr + offset, src, cur);
+		kunmap_atomic(kaddr, KM_USER1);
+
+		src += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(write_extent_buffer);
+
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, c, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(memset_extent_buffer);
+
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len)
+{
+	u64 dst_len = dst->len;
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(src->len != dst_len);
+
+	offset = (start_offset + dst_offset) &
+		((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(dst, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		read_extent_buffer(src, kaddr + offset, src_offset, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		src_offset += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(copy_extent_buffer);
+
+static void move_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	if (dst_page == src_page) {
+		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+	} else {
+		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
+		char *p = dst_kaddr + dst_off + len;
+		char *s = src_kaddr + src_off + len;
+
+		while (len--)
+			*--p = *--s;
+
+		kunmap_atomic(src_kaddr, KM_USER1);
+	}
+	kunmap_atomic(dst_kaddr, KM_USER0);
+}
+
+static void copy_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *src_kaddr;
+
+	if (dst_page != src_page)
+		src_kaddr = kmap_atomic(src_page, KM_USER1);
+	else
+		src_kaddr = dst_kaddr;
+
+	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+	kunmap_atomic(dst_kaddr, KM_USER0);
+	if (dst_page != src_page)
+		kunmap_atomic(src_kaddr, KM_USER1);
+}
+
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+
+	while(len > 0) {
+		dst_off_in_page = (start_offset + dst_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+
+		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+					       src_off_in_page));
+		cur = min_t(unsigned long, cur,
+			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+		copy_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
+			   dst_off_in_page, src_off_in_page, cur);
+
+		src_offset += cur;
+		dst_offset += cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memcpy_extent_buffer);
+
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	unsigned long dst_end = dst_offset + len - 1;
+	unsigned long src_end = src_offset + len - 1;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset < src_offset) {
+		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+		return;
+	}
+	while(len > 0) {
+		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+		dst_off_in_page = (start_offset + dst_end) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_end) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+
+		cur = min_t(unsigned long, len, src_off_in_page + 1);
+		cur = min(cur, dst_off_in_page + 1);
+		move_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
+			   dst_off_in_page - cur + 1,
+			   src_off_in_page - cur + 1, cur);
+
+		dst_end -= cur;
+		src_end -= cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memmove_extent_buffer);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..06be1fe84b29
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,193 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_CSUM (1 << 9)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/*
+ * page->private values.  Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+
+struct extent_io_ops {
+	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
+	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
+};
+
+struct extent_io_tree {
+	struct rb_root state;
+	struct address_space *mapping;
+	u64 dirty_bytes;
+	rwlock_t lock;
+	struct extent_io_ops *ops;
+	spinlock_t lru_lock;
+	struct list_head buffer_lru;
+	int lru_size;
+};
+
+struct extent_state {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	wait_queue_head_t wq;
+	atomic_t refs;
+	unsigned long state;
+
+	/* for use by the FS */
+	u64 private;
+
+	struct list_head list;
+};
+
+struct extent_buffer {
+	u64 start;
+	unsigned long len;
+	char *map_token;
+	char *kaddr;
+	unsigned long map_start;
+	unsigned long map_len;
+	struct page *first_page;
+	struct list_head lru;
+	atomic_t refs;
+	int flags;
+};
+
+struct extent_map_tree;
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+					  struct page *page,
+					  size_t page_offset,
+					  u64 start, u64 len,
+					  int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			  struct address_space *mapping, gfp_t mask);
+void extent_io_tree_empty_lru(struct extent_io_tree *tree);
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end,
+		     u64 max_bytes, unsigned long bits);
+
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int filled);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		   gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc);
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent);
+int extent_prepare_write(struct extent_io_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_io_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent);
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+					  u64 start, unsigned long len,
+					  struct page *page0,
+					  gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 u64 start, unsigned long len,
+					  gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb, u64 start, int wait);
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+			unsigned long start,
+			unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+			 unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+				    struct extent_buffer *eb);
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+			      struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+			       struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+			   struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 010a287fbd71..268ad8facf6e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,17 +1,10 @@
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include <linux/bio.h>
-#include <linux/mm.h>
+#include <linux/err.h>
 #include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/page-flags.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>
-#include <linux/swap.h>
 #include <linux/version.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
+#include <linux/hardirq.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -21,27 +14,6 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 						    unsigned long));
 
 static struct kmem_cache *extent_map_cache;
-static struct kmem_cache *extent_state_cache;
-static struct kmem_cache *extent_buffer_cache;
-
-static LIST_HEAD(buffers);
-static LIST_HEAD(states);
-
-static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
-#define BUFFER_LRU_MAX 64
-
-struct tree_entry {
-	u64 start;
-	u64 end;
-	int in_tree;
-	struct rb_node rb_node;
-};
-
-struct extent_page_data {
-	struct bio *bio;
-	struct extent_map_tree *tree;
-	get_extent_t *get_extent;
-};
 
 int __init extent_map_init(void)
 {
@@ -50,72 +22,23 @@ int __init extent_map_init(void)
 					    NULL);
 	if (!extent_map_cache)
 		return -ENOMEM;
-	extent_state_cache = btrfs_cache_create("extent_state",
-					    sizeof(struct extent_state), 0,
-					    NULL);
-	if (!extent_state_cache)
-		goto free_map_cache;
-	extent_buffer_cache = btrfs_cache_create("extent_buffers",
-					    sizeof(struct extent_buffer), 0,
-					    NULL);
-	if (!extent_buffer_cache)
-		goto free_state_cache;
 	return 0;
-
-free_state_cache:
-	kmem_cache_destroy(extent_state_cache);
-free_map_cache:
-	kmem_cache_destroy(extent_map_cache);
-	return -ENOMEM;
 }
 
 void extent_map_exit(void)
 {
-	struct extent_state *state;
-
-	while (!list_empty(&states)) {
-		state = list_entry(states.next, struct extent_state, list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
-		list_del(&state->list);
-		kmem_cache_free(extent_state_cache, state);
-
-	}
-
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
-	if (extent_state_cache)
-		kmem_cache_destroy(extent_state_cache);
-	if (extent_buffer_cache)
-		kmem_cache_destroy(extent_buffer_cache);
 }
 
-void extent_map_tree_init(struct extent_map_tree *tree,
-			  struct address_space *mapping, gfp_t mask)
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
-	tree->state.rb_node = NULL;
-	tree->ops = NULL;
-	tree->dirty_bytes = 0;
-	rwlock_init(&tree->lock);
-	spin_lock_init(&tree->lru_lock);
-	tree->mapping = mapping;
-	INIT_LIST_HEAD(&tree->buffer_lru);
-	tree->lru_size = 0;
+	tree->last = NULL;
+	spin_lock_init(&tree->lock);
 }
 EXPORT_SYMBOL(extent_map_tree_init);
 
-void extent_map_tree_empty_lru(struct extent_map_tree *tree)
-{
-	struct extent_buffer *eb;
-	while(!list_empty(&tree->buffer_lru)) {
-		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
-				lru);
-		list_del_init(&eb->lru);
-		free_extent_buffer(eb);
-	}
-}
-EXPORT_SYMBOL(extent_map_tree_empty_lru);
-
 struct extent_map *alloc_extent_map(gfp_t mask)
 {
 	struct extent_map *em;
@@ -123,6 +46,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 	if (!em || IS_ERR(em))
 		return em;
 	em->in_tree = 0;
+	em->flags = 0;
 	atomic_set(&em->refs, 1);
 	return em;
 }
@@ -132,6 +56,7 @@ void free_extent_map(struct extent_map *em)
 {
 	if (!em)
 		return;
+	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
 		kmem_cache_free(extent_map_cache, em);
@@ -139,64 +64,28 @@ void free_extent_map(struct extent_map *em)
 }
 EXPORT_SYMBOL(free_extent_map);
 
-
-struct extent_state *alloc_extent_state(gfp_t mask)
-{
-	struct extent_state *state;
-	unsigned long flags;
-
-	state = kmem_cache_alloc(extent_state_cache, mask);
-	if (!state || IS_ERR(state))
-		return state;
-	state->state = 0;
-	state->in_tree = 0;
-	state->private = 0;
-
-	spin_lock_irqsave(&state_lock, flags);
-	list_add(&state->list, &states);
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	atomic_set(&state->refs, 1);
-	init_waitqueue_head(&state->wq);
-	return state;
-}
-EXPORT_SYMBOL(alloc_extent_state);
-
-void free_extent_state(struct extent_state *state)
-{
-	unsigned long flags;
-	if (!state)
-		return;
-	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->in_tree);
-		spin_lock_irqsave(&state_lock, flags);
-		list_del(&state->list);
-		spin_unlock_irqrestore(&state_lock, flags);
-		kmem_cache_free(extent_state_cache, state);
-	}
-}
-EXPORT_SYMBOL(free_extent_state);
-
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
-	struct tree_entry *entry;
+	struct extent_map *entry;
 
 	while(*p) {
 		parent = *p;
-		entry = rb_entry(parent, struct tree_entry, rb_node);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		WARN_ON(!entry->in_tree);
 
 		if (offset < entry->start)
 			p = &(*p)->rb_left;
-		else if (offset > entry->end)
+		else if (offset >= extent_map_end(entry))
 			p = &(*p)->rb_right;
 		else
 			return parent;
 	}
 
-	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry = rb_entry(node, struct extent_map, rb_node);
 	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
@@ -210,17 +99,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
-	struct tree_entry *entry;
-	struct tree_entry *prev_entry = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
 
 	while(n) {
-		entry = rb_entry(n, struct tree_entry, rb_node);
+		entry = rb_entry(n, struct extent_map, rb_node);
 		prev = n;
 		prev_entry = entry;
 
+		WARN_ON(!entry->in_tree);
+
 		if (offset < entry->start)
 			n = n->rb_left;
-		else if (offset > entry->end)
+		else if (offset >= extent_map_end(entry))
 			n = n->rb_right;
 		else
 			return n;
@@ -228,19 +119,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset > prev_entry->end) {
+		while(prev && offset >= extent_map_end(prev_entry)) {
 			prev = rb_next(prev);
-			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
 		*prev_ret = prev;
 		prev = orig_prev;
 	}
 
 	if (next_ret) {
-		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		while(prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
-			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
 		*next_ret = prev;
 	}
@@ -257,22 +148,26 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 	return ret;
 }
 
-static int tree_delete(struct rb_root *root, u64 offset)
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 {
-	struct rb_node *node;
-	struct tree_entry *entry;
-
-	node = __tree_search(root, offset, NULL, NULL);
-	if (!node)
-		return -ENOENT;
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	entry->in_tree = 0;
-	rb_erase(node, root);
+	if (extent_map_end(prev) == next->start &&
+	    prev->flags == next->flags &&
+	    prev->bdev == next->bdev &&
+	    ((next->block_start == EXTENT_MAP_HOLE &&
+	      prev->block_start == EXTENT_MAP_HOLE) ||
+	     (next->block_start == EXTENT_MAP_INLINE &&
+	      prev->block_start == EXTENT_MAP_INLINE) ||
+	     (next->block_start == EXTENT_MAP_DELALLOC &&
+	      prev->block_start == EXTENT_MAP_DELALLOC) ||
+	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+	      next->block_start == extent_map_block_end(prev)))) {
+		return 1;
+	}
 	return 0;
 }
 
 /*
- * add_extent_mapping tries a simple backward merge with existing
+ * add_extent_mapping tries a simple forward/backward merge with existing
  * mappings.  The extent_map struct passed in will be inserted into
  * the tree directly (no copies made, just a reference taken).
  */
@@ -280,13 +175,12 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *prev = NULL;
+	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
-	write_lock_irq(&tree->lock);
-	rb = tree_insert(&tree->map, em->end, &em->rb_node);
+	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
-		prev = rb_entry(rb, struct extent_map, rb_node);
+		merge = rb_entry(rb, struct extent_map, rb_node);
 		ret = -EEXIST;
 		goto out;
 	}
@@ -294,53 +188,60 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
 		if (rb)
-			prev = rb_entry(rb, struct extent_map, rb_node);
-		if (prev && prev->end + 1 == em->start &&
-		    ((em->block_start == EXTENT_MAP_HOLE &&
-		      prev->block_start == EXTENT_MAP_HOLE) ||
-		     (em->block_start == EXTENT_MAP_INLINE &&
-		      prev->block_start == EXTENT_MAP_INLINE) ||
-		     (em->block_start == EXTENT_MAP_DELALLOC &&
-		      prev->block_start == EXTENT_MAP_DELALLOC) ||
-		     (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
-		      em->block_start == prev->block_end + 1))) {
-			em->start = prev->start;
-			em->block_start = prev->block_start;
-			rb_erase(&prev->rb_node, &tree->map);
-			prev->in_tree = 0;
-			free_extent_map(prev);
+			merge = rb_entry(rb, struct extent_map, rb_node);
+		if (rb && mergable_maps(merge, em)) {
+			em->start = merge->start;
+			em->len += merge->len;
+			em->block_start = merge->block_start;
+			merge->in_tree = 0;
+			rb_erase(&merge->rb_node, &tree->map);
+			free_extent_map(merge);
 		}
 	 }
+	rb = rb_next(&em->rb_node);
+	if (rb)
+		merge = rb_entry(rb, struct extent_map, rb_node);
+	if (rb && mergable_maps(em, merge)) {
+		em->len += merge->len;
+		rb_erase(&merge->rb_node, &tree->map);
+		merge->in_tree = 0;
+		free_extent_map(merge);
+	}
+	tree->last = em;
 out:
-	write_unlock_irq(&tree->lock);
 	return ret;
 }
 EXPORT_SYMBOL(add_extent_mapping);
 
+static u64 range_end(u64 start, u64 len)
+{
+	if (start + len < start)
+		return (u64)-1;
+	return start + len;
+}
+
 /*
  * lookup_extent_mapping returns the first extent_map struct in the
- * tree that intersects the [start, end] (inclusive) range.  There may
+ * tree that intersects the [start, len] range.  There may
  * be additional objects in the tree that intersect, so check the object
  * returned carefully to make sure you don't need additional lookups.
  */
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 end)
+					 u64 start, u64 len)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
+	struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found;
 
-	read_lock_irq(&tree->lock);
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
-		if (em->start <= end && em->end >= start)
+		if (end > em->start && start < extent_map_end(em))
 			goto found;
 	}
 	if (!rb_node && next) {
 		em = rb_entry(next, struct extent_map, rb_node);
-		if (em->start <= end && em->end >= start)
+		if (end > em->start && start < extent_map_end(em))
 			goto found;
 	}
 	if (!rb_node) {
@@ -352,14 +253,16 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (em->end < start || em->start > end) {
-		em = NULL;
-		goto out;
-	}
+	if (end > em->start && start < extent_map_end(em))
+		goto found;
+
+	em = NULL;
+	goto out;
+
 found:
 	atomic_inc(&em->refs);
+	tree->last = em;
 out:
-	read_unlock_irq(&tree->lock);
 	return em;
 }
 EXPORT_SYMBOL(lookup_extent_mapping);
@@ -370,2866 +273,12 @@ EXPORT_SYMBOL(lookup_extent_mapping);
  */
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret;
+	int ret = 0;
 
-	write_lock_irq(&tree->lock);
-	ret = tree_delete(&tree->map, em->end);
-	write_unlock_irq(&tree->lock);
+	rb_erase(&em->rb_node, &tree->map);
+	em->in_tree = 0;
+	if (tree->last == em)
+		tree->last = NULL;
 	return ret;
 }
 EXPORT_SYMBOL(remove_extent_mapping);
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree.  Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static int merge_state(struct extent_map_tree *tree,
-		       struct extent_state *state)
-{
-	struct extent_state *other;
-	struct rb_node *other_node;
-
-	if (state->state & EXTENT_IOBITS)
-		return 0;
-
-	other_node = rb_prev(&state->rb_node);
-	if (other_node) {
-		other = rb_entry(other_node, struct extent_state, rb_node);
-		if (other->end == state->start - 1 &&
-		    other->state == state->state) {
-			state->start = other->start;
-			other->in_tree = 0;
-			rb_erase(&other->rb_node, &tree->state);
-			free_extent_state(other);
-		}
-	}
-	other_node = rb_next(&state->rb_node);
-	if (other_node) {
-		other = rb_entry(other_node, struct extent_state, rb_node);
-		if (other->start == state->end + 1 &&
-		    other->state == state->state) {
-			other->start = state->start;
-			state->in_tree = 0;
-			rb_erase(&state->rb_node, &tree->state);
-			free_extent_state(state);
-		}
-	}
-	return 0;
-}
-
-/*
- * insert an extent_state struct into the tree.  'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally.  This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_map_tree *tree,
-			struct extent_state *state, u64 start, u64 end,
-			int bits)
-{
-	struct rb_node *node;
-
-	if (end < start) {
-		printk("end < start %Lu %Lu\n", end, start);
-		WARN_ON(1);
-	}
-	if (bits & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
-	state->state |= bits;
-	state->start = start;
-	state->end = end;
-	node = tree_insert(&tree->state, end, &state->rb_node);
-	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
-		free_extent_state(state);
-		return -EEXIST;
-	}
-	merge_state(tree, state);
-	return 0;
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half.  'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end].  After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
-		       struct extent_state *prealloc, u64 split)
-{
-	struct rb_node *node;
-	prealloc->start = orig->start;
-	prealloc->end = split - 1;
-	prealloc->state = orig->state;
-	orig->start = split;
-
-	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
-	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
-		free_extent_state(prealloc);
-		return -EEXIST;
-	}
-	return 0;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static int clear_state_bit(struct extent_map_tree *tree,
-			    struct extent_state *state, int bits, int wake,
-			    int delete)
-{
-	int ret = state->state & bits;
-
-	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
-		u64 range = state->end - state->start + 1;
-		WARN_ON(range > tree->dirty_bytes);
-		tree->dirty_bytes -= range;
-	}
-	state->state &= ~bits;
-	if (wake)
-		wake_up(&state->wq);
-	if (delete || state->state == 0) {
-		if (state->in_tree) {
-			rb_erase(&state->rb_node, &tree->state);
-			state->in_tree = 0;
-			free_extent_state(state);
-		} else {
-			WARN_ON(1);
-		}
-	} else {
-		merge_state(tree, state);
-	}
-	return ret;
-}
-
-/*
- * clear some bits on a range in the tree.  This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
- */
-int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask)
-{
-	struct extent_state *state;
-	struct extent_state *prealloc = NULL;
-	struct rb_node *node;
-	unsigned long flags;
-	int err;
-	int set = 0;
-
-again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		if (!prealloc)
-			return -ENOMEM;
-	}
-
-	write_lock_irqsave(&tree->lock, flags);
-	/*
-	 * this search will find the extents that end after
-	 * our range starts
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node)
-		goto out;
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start > end)
-		goto out;
-	WARN_ON(state->end < start);
-
-	/*
-	 *     | ---- desired range ---- |
-	 *  | state | or
-	 *  | ------------- state -------------- |
-	 *
-	 * We need to split the extent we found, and may flip
-	 * bits on second half.
-	 *
-	 * If the extent we found extends past our range, we
-	 * just split and search again.  It'll get split again
-	 * the next time though.
-	 *
-	 * If the extent we found is inside our range, we clear
-	 * the desired bit on it.
-	 */
-
-	if (state->start < start) {
-		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
-		if (err)
-			goto out;
-		if (state->end <= end) {
-			start = state->end + 1;
-			set |= clear_state_bit(tree, state, bits,
-					wake, delete);
-		} else {
-			start = state->start;
-		}
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *                        | state |
-	 * We need to split the extent, and clear the bit
-	 * on the first half
-	 */
-	if (state->start <= end && state->end > end) {
-		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
-
-		if (wake)
-			wake_up(&state->wq);
-		set |= clear_state_bit(tree, prealloc, bits,
-				       wake, delete);
-		prealloc = NULL;
-		goto out;
-	}
-
-	start = state->end + 1;
-	set |= clear_state_bit(tree, state, bits, wake, delete);
-	goto search_again;
-
-out:
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (prealloc)
-		free_extent_state(prealloc);
-
-	return set;
-
-search_again:
-	if (start > end)
-		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (mask & __GFP_WAIT)
-		cond_resched();
-	goto again;
-}
-EXPORT_SYMBOL(clear_extent_bit);
-
-static int wait_on_state(struct extent_map_tree *tree,
-			 struct extent_state *state)
-{
-	DEFINE_WAIT(wait);
-	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	read_unlock_irq(&tree->lock);
-	schedule();
-	read_lock_irq(&tree->lock);
-	finish_wait(&state->wq, &wait);
-	return 0;
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
-{
-	struct extent_state *state;
-	struct rb_node *node;
-
-	read_lock_irq(&tree->lock);
-again:
-	while (1) {
-		/*
-		 * this search will find all the extents that end after
-		 * our range starts
-		 */
-		node = tree_search(&tree->state, start);
-		if (!node)
-			break;
-
-		state = rb_entry(node, struct extent_state, rb_node);
-
-		if (state->start > end)
-			goto out;
-
-		if (state->state & bits) {
-			start = state->start;
-			atomic_inc(&state->refs);
-			wait_on_state(tree, state);
-			free_extent_state(state);
-			goto again;
-		}
-		start = state->end + 1;
-
-		if (start > end)
-			break;
-
-		if (need_resched()) {
-			read_unlock_irq(&tree->lock);
-			cond_resched();
-			read_lock_irq(&tree->lock);
-		}
-	}
-out:
-	read_unlock_irq(&tree->lock);
-	return 0;
-}
-EXPORT_SYMBOL(wait_extent_bit);
-
-static void set_state_bits(struct extent_map_tree *tree,
-			   struct extent_state *state,
-			   int bits)
-{
-	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
-		u64 range = state->end - state->start + 1;
-		tree->dirty_bytes += range;
-	}
-	state->state |= bits;
-}
-
-/*
- * set some bits on a range in the tree.  This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set.  The start of the existing
- * range is returned in failed_start in this case.
- *
- * [start, end] is inclusive
- * This takes the tree lock.
- */
-int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
-		   int exclusive, u64 *failed_start, gfp_t mask)
-{
-	struct extent_state *state;
-	struct extent_state *prealloc = NULL;
-	struct rb_node *node;
-	unsigned long flags;
-	int err = 0;
-	int set;
-	u64 last_start;
-	u64 last_end;
-again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		if (!prealloc)
-			return -ENOMEM;
-	}
-
-	write_lock_irqsave(&tree->lock, flags);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node) {
-		err = insert_state(tree, prealloc, start, end, bits);
-		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
-		goto out;
-	}
-
-	state = rb_entry(node, struct extent_state, rb_node);
-	last_start = state->start;
-	last_end = state->end;
-
-	/*
-	 * | ---- desired range ---- |
-	 * | state |
-	 *
-	 * Just lock what we found and keep going
-	 */
-	if (state->start == start && state->end <= end) {
-		set = state->state & bits;
-		if (set && exclusive) {
-			*failed_start = state->start;
-			err = -EEXIST;
-			goto out;
-		}
-		set_state_bits(tree, state, bits);
-		start = state->end + 1;
-		merge_state(tree, state);
-		goto search_again;
-	}
-
-	/*
-	 *     | ---- desired range ---- |
-	 * | state |
-	 *   or
-	 * | ------------- state -------------- |
-	 *
-	 * We need to split the extent we found, and may flip bits on
-	 * second half.
-	 *
-	 * If the extent we found extends past our
-	 * range, we just split and search again.  It'll get split
-	 * again the next time though.
-	 *
-	 * If the extent we found is inside our range, we set the
-	 * desired bit on it.
-	 */
-	if (state->start < start) {
-		set = state->state & bits;
-		if (exclusive && set) {
-			*failed_start = start;
-			err = -EEXIST;
-			goto out;
-		}
-		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
-		if (err)
-			goto out;
-		if (state->end <= end) {
-			set_state_bits(tree, state, bits);
-			start = state->end + 1;
-			merge_state(tree, state);
-		} else {
-			start = state->start;
-		}
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *     | state | or               | state |
-	 *
-	 * There's a hole, we need to insert something in it and
-	 * ignore the extent we found.
-	 */
-	if (state->start > start) {
-		u64 this_end;
-		if (end < last_start)
-			this_end = end;
-		else
-			this_end = last_start -1;
-		err = insert_state(tree, prealloc, start, this_end,
-				   bits);
-		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
-		if (err)
-			goto out;
-		start = this_end + 1;
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *                        | state |
-	 * We need to split the extent, and set the bit
-	 * on the first half
-	 */
-	if (state->start <= end && state->end > end) {
-		set = state->state & bits;
-		if (exclusive && set) {
-			*failed_start = start;
-			err = -EEXIST;
-			goto out;
-		}
-		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
-
-		set_state_bits(tree, prealloc, bits);
-		merge_state(tree, prealloc);
-		prealloc = NULL;
-		goto out;
-	}
-
-	goto search_again;
-
-out:
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (prealloc)
-		free_extent_state(prealloc);
-
-	return err;
-
-search_again:
-	if (start > end)
-		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (mask & __GFP_WAIT)
-		cond_resched();
-	goto again;
-}
-EXPORT_SYMBOL(set_extent_bit);
-
-/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_dirty);
-
-int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, bits, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_bits);
-
-int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_bits);
-
-int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_delalloc);
-
-int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_dirty);
-
-int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_new);
-
-int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_new);
-
-int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_uptodate);
-
-int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			  gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_uptodate);
-
-int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
-			      0, NULL, mask);
-}
-EXPORT_SYMBOL(set_extent_writeback);
-
-int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
-			   gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_writeback);
-
-int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
-}
-EXPORT_SYMBOL(wait_on_extent_writeback);
-
-/*
- * locks a range in ascending order, waiting for any locked regions
- * it hits on the way.  [start,end] are inclusive, and this will sleep.
- */
-int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
-{
-	int err;
-	u64 failed_start;
-	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-				     &failed_start, mask);
-		if (err == -EEXIST && (mask & __GFP_WAIT)) {
-			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
-			start = failed_start;
-		} else {
-			break;
-		}
-		WARN_ON(start > end);
-	}
-	return err;
-}
-EXPORT_SYMBOL(lock_extent);
-
-int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
-		  gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
-}
-EXPORT_SYMBOL(unlock_extent);
-
-/*
- * helper function to set pages and extents in the tree dirty
- */
-int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
-		__set_page_dirty_nobuffers(page);
-		page_cache_release(page);
-		index++;
-	}
-	set_extent_dirty(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(set_range_dirty);
-
-/*
- * helper function to set both pages and extents in the tree writeback
- */
-int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
-		set_page_writeback(page);
-		page_cache_release(page);
-		index++;
-	}
-	set_extent_writeback(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(set_range_writeback);
-
-int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	read_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	read_unlock_irq(&tree->lock);
-	return ret;
-}
-EXPORT_SYMBOL(find_first_extent_bit);
-
-u64 find_lock_delalloc_range(struct extent_map_tree *tree,
-			     u64 *start, u64 *end, u64 max_bytes)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	u64 cur_start = *start;
-	u64 found = 0;
-	u64 total_bytes = 0;
-
-	write_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-search_again:
-	node = tree_search(&tree->state, cur_start);
-	if (!node || IS_ERR(node)) {
-		*end = (u64)-1;
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (found && state->start != cur_start) {
-			goto out;
-		}
-		if (!(state->state & EXTENT_DELALLOC)) {
-			if (!found)
-				*end = state->end;
-			goto out;
-		}
-		if (!found) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if (!(prev_state->state & EXTENT_DELALLOC))
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			write_unlock_irq(&tree->lock);
-			schedule();
-			write_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		state->state |= EXTENT_LOCKED;
-		if (!found)
-			*start = state->start;
-		found++;
-		*end = state->end;
-		cur_start = state->end + 1;
-		node = rb_next(node);
-		if (!node)
-			break;
-		total_bytes += state->end - state->start + 1;
-		if (total_bytes >= max_bytes)
-			break;
-	}
-out:
-	write_unlock_irq(&tree->lock);
-	return found;
-}
-
-u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 search_end, u64 max_bytes,
-		     unsigned long bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	u64 cur_start = *start;
-	u64 total_bytes = 0;
-	int found = 0;
-
-	if (search_end <= cur_start) {
-		printk("search_end %Lu start %Lu\n", search_end, cur_start);
-		WARN_ON(1);
-		return 0;
-	}
-
-	write_lock_irq(&tree->lock);
-	if (cur_start == 0 && bits == EXTENT_DIRTY) {
-		total_bytes = tree->dirty_bytes;
-		goto out;
-	}
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, cur_start);
-	if (!node || IS_ERR(node)) {
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->start > search_end)
-			break;
-		if (state->end >= cur_start && (state->state & bits)) {
-			total_bytes += min(search_end, state->end) + 1 -
-				       max(cur_start, state->start);
-			if (total_bytes >= max_bytes)
-				break;
-			if (!found) {
-				*start = state->start;
-				found = 1;
-			}
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	write_unlock_irq(&tree->lock);
-	return total_bytes;
-}
-/*
- * helper function to lock both pages and extents in the tree.
- * pages must be locked first.
- */
-int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-	int err;
-
-	while (index <= end_index) {
-		page = grab_cache_page(tree->mapping, index);
-		if (!page) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto failed;
-		}
-		index++;
-	}
-	lock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-
-failed:
-	/*
-	 * we failed above in getting the page at 'index', so we undo here
-	 * up to but not including the page at 'index'
-	 */
-	end_index = index;
-	index = start >> PAGE_CACHE_SHIFT;
-	while (index < end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	return err;
-}
-EXPORT_SYMBOL(lock_range);
-
-/*
- * helper function to unlock both pages and extents in the tree.
- */
-int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	unlock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(unlock_range);
-
-int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 0;
-
-	write_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start != start) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state->private = private;
-out:
-	write_unlock_irq(&tree->lock);
-	return ret;
-}
-
-int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 0;
-
-	read_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start != start) {
-		ret = -ENOENT;
-		goto out;
-	}
-	*private = state->private;
-out:
-	read_unlock_irq(&tree->lock);
-	return ret;
-}
-
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if ever extent in the tree
- * has the bits set.  Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		   int bits, int filled)
-{
-	struct extent_state *state = NULL;
-	struct rb_node *node;
-	int bitset = 0;
-
-	read_lock_irq(&tree->lock);
-	node = tree_search(&tree->state, start);
-	while (node && start <= end) {
-		state = rb_entry(node, struct extent_state, rb_node);
-
-		if (filled && state->start > start) {
-			bitset = 0;
-			break;
-		}
-
-		if (state->start > end)
-			break;
-
-		if (state->state & bits) {
-			bitset = 1;
-			if (!filled)
-				break;
-		} else if (filled) {
-			bitset = 0;
-			break;
-		}
-		start = state->end + 1;
-		if (start > end)
-			break;
-		node = rb_next(node);
-		if (!node) {
-			if (filled)
-				bitset = 0;
-			break;
-		}
-	}
-	read_unlock_irq(&tree->lock);
-	return bitset;
-}
-EXPORT_SYMBOL(test_range_bit);
-
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static int check_page_uptodate(struct extent_map_tree *tree,
-			       struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
-		SetPageUptodate(page);
-	return 0;
-}
-
-/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static int check_page_locked(struct extent_map_tree *tree,
-			     struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
-		unlock_page(page);
-	return 0;
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static int check_page_writeback(struct extent_map_tree *tree,
-			     struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
-		end_page_writeback(page);
-	return 0;
-}
-
-/* lots and lots of room for performance fixes in the end_bio funcs */
-
-/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
- *
- * Scheduling is not allowed, so the extent state tree is expected
- * to have one and only one object corresponding to this IO.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_writepage(struct bio *bio, int err)
-#else
-static int end_bio_extent_writepage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-	int whole_page;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			 bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
-		if (whole_page)
-			end_page_writeback(page);
-		else
-			check_page_writeback(tree, page);
-		if (tree->ops && tree->ops->writepage_end_io_hook)
-			tree->ops->writepage_end_io_hook(page, start, end);
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
- *
- * Scheduling is not allowed, so the extent state tree is expected
- * to have one and only one object corresponding to this IO.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_readpage(struct bio *bio, int err)
-#else
-static int end_bio_extent_readpage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
-{
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-	int whole_page;
-	int ret;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
-			ret = tree->ops->readpage_end_io_hook(page, start, end);
-			if (ret)
-				uptodate = 0;
-		}
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
-			if (whole_page)
-				SetPageUptodate(page);
-			else
-				check_page_uptodate(tree, page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-
-		unlock_extent(tree, start, end, GFP_ATOMIC);
-
-		if (whole_page)
-			unlock_page(page);
-		else
-			check_page_locked(tree, page);
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-/*
- * IO done from prepare_write is pretty simple, we just unlock
- * the structs in the extent tree when done, and set the uptodate bits
- * as appropriate.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_preparewrite(struct bio *bio, int err)
-#else
-static int end_bio_extent_preparewrite(struct bio *bio,
-				       unsigned int bytes_done, int err)
-#endif
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-
-		unlock_extent(tree, start, end, GFP_ATOMIC);
-
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-static struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-		 gfp_t gfp_flags)
-{
-	struct bio *bio;
-
-	bio = bio_alloc(gfp_flags, nr_vecs);
-
-	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
-	}
-
-	if (bio) {
-		bio->bi_bdev = bdev;
-		bio->bi_sector = first_sector;
-	}
-	return bio;
-}
-
-static int submit_one_bio(int rw, struct bio *bio)
-{
-	u64 maxsector;
-	int ret = 0;
-
-	bio_get(bio);
-
-        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
-	if (maxsector < bio->bi_sector) {
-		printk("sector too large max %Lu got %llu\n", maxsector,
-			(unsigned long long)bio->bi_sector);
-		WARN_ON(1);
-	}
-
-	submit_bio(rw, bio);
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	bio_put(bio);
-	return ret;
-}
-
-static int submit_extent_page(int rw, struct extent_map_tree *tree,
-			      struct page *page, sector_t sector,
-			      size_t size, unsigned long offset,
-			      struct block_device *bdev,
-			      struct bio **bio_ret,
-			      unsigned long max_pages,
-			      bio_end_io_t end_io_func)
-{
-	int ret = 0;
-	struct bio *bio;
-	int nr;
-
-	if (bio_ret && *bio_ret) {
-		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio);
-			bio = NULL;
-		} else {
-			return 0;
-		}
-	}
-	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
-	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
-	if (!bio) {
-		printk("failed to allocate bio nr %d\n", nr);
-	}
-	bio_add_page(bio, page, size, offset);
-	bio->bi_end_io = end_io_func;
-	bio->bi_private = tree;
-	if (bio_ret) {
-		*bio_ret = bio;
-	} else {
-		ret = submit_one_bio(rw, bio);
-	}
-
-	return ret;
-}
-
-void set_page_extent_mapped(struct page *page)
-{
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
-		page_cache_get(page);
-	}
-}
-
-void set_page_extent_head(struct page *page, unsigned long len)
-{
-	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
-}
-
-/*
- * basic readpage implementation.  Locked extent state structs are inserted
- * into the tree that are removed when the IO is done (by the end_io
- * handlers)
- */
-static int __extent_read_full_page(struct extent_map_tree *tree,
-				   struct page *page,
-				   get_extent_t *get_extent,
-				   struct bio **bio)
-{
-	struct inode *inode = page->mapping->host;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
-	u64 end;
-	u64 cur = start;
-	u64 extent_offset;
-	u64 last_byte = i_size_read(inode);
-	u64 block_start;
-	u64 cur_end;
-	sector_t sector;
-	struct extent_map *em;
-	struct block_device *bdev;
-	int ret;
-	int nr = 0;
-	size_t page_offset = 0;
-	size_t iosize;
-	size_t blocksize = inode->i_sb->s_blocksize;
-
-	set_page_extent_mapped(page);
-
-	end = page_end;
-	lock_extent(tree, start, end, GFP_NOFS);
-
-	while (cur <= end) {
-		if (cur >= last_byte) {
-			char *userpage;
-			iosize = PAGE_CACHE_SIZE - page_offset;
-			userpage = kmap_atomic(page, KM_USER0);
-			memset(userpage + page_offset, 0, iosize);
-			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			break;
-		}
-		em = get_extent(inode, page, page_offset, cur, end, 0);
-		if (IS_ERR(em) || !em) {
-			SetPageError(page);
-			unlock_extent(tree, cur, end, GFP_NOFS);
-			break;
-		}
-
-		extent_offset = cur - em->start;
-		BUG_ON(em->end < cur);
-		BUG_ON(end < cur);
-
-		iosize = min(em->end - cur, end - cur) + 1;
-		cur_end = min(em->end, end);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
-		bdev = em->bdev;
-		block_start = em->block_start;
-		free_extent_map(em);
-		em = NULL;
-
-		/* we've found a hole, just zero and go on */
-		if (block_start == EXTENT_MAP_HOLE) {
-			char *userpage;
-			userpage = kmap_atomic(page, KM_USER0);
-			memset(userpage + page_offset, 0, iosize);
-			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
-
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-
-		ret = 0;
-		if (tree->ops && tree->ops->readpage_io_hook) {
-			ret = tree->ops->readpage_io_hook(page, cur,
-							  cur + iosize - 1);
-		}
-		if (!ret) {
-			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-			nr -= page->index;
-			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
-					 bdev, bio, nr,
-					 end_bio_extent_readpage);
-		}
-		if (ret)
-			SetPageError(page);
-		cur = cur + iosize;
-		page_offset += iosize;
-		nr++;
-	}
-	if (!nr) {
-		if (!PageError(page))
-			SetPageUptodate(page);
-		unlock_page(page);
-	}
-	return 0;
-}
-
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
-			    get_extent_t *get_extent)
-{
-	struct bio *bio = NULL;
-	int ret;
-
-	ret = __extent_read_full_page(tree, page, get_extent, &bio);
-	if (bio)
-		submit_one_bio(READ, bio);
-	return ret;
-}
-EXPORT_SYMBOL(extent_read_full_page);
-
-/*
- * the writepage semantics are similar to regular writepage.  extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback.  Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
- */
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
-			      void *data)
-{
-	struct inode *inode = page->mapping->host;
-	struct extent_page_data *epd = data;
-	struct extent_map_tree *tree = epd->tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 delalloc_start;
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
-	u64 end;
-	u64 cur = start;
-	u64 extent_offset;
-	u64 last_byte = i_size_read(inode);
-	u64 block_start;
-	u64 iosize;
-	sector_t sector;
-	struct extent_map *em;
-	struct block_device *bdev;
-	int ret;
-	int nr = 0;
-	size_t page_offset = 0;
-	size_t blocksize;
-	loff_t i_size = i_size_read(inode);
-	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
-	u64 nr_delalloc;
-	u64 delalloc_end;
-
-	WARN_ON(!PageLocked(page));
-	if (page->index > end_index) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		unlock_page(page);
-		return 0;
-	}
-
-	if (page->index == end_index) {
-		char *userpage;
-
-		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-
-		userpage = kmap_atomic(page, KM_USER0);
-		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(userpage, KM_USER0);
-	}
-
-	set_page_extent_mapped(page);
-
-	delalloc_start = start;
-	delalloc_end = 0;
-	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
-						       &delalloc_end,
-						       128 * 1024 * 1024);
-		if (nr_delalloc == 0) {
-			delalloc_start = delalloc_end + 1;
-			continue;
-		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
-		delalloc_start = delalloc_end + 1;
-	}
-	lock_extent(tree, start, page_end, GFP_NOFS);
-
-	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after lock_extent\n");
-	}
-
-	if (last_byte <= start) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		goto done;
-	}
-
-	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
-	blocksize = inode->i_sb->s_blocksize;
-
-	while (cur <= end) {
-		if (cur >= last_byte) {
-			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
-			break;
-		}
-		em = epd->get_extent(inode, page, page_offset, cur, end, 1);
-		if (IS_ERR(em) || !em) {
-			SetPageError(page);
-			break;
-		}
-
-		extent_offset = cur - em->start;
-		BUG_ON(em->end < cur);
-		BUG_ON(end < cur);
-		iosize = min(em->end - cur, end - cur) + 1;
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
-		bdev = em->bdev;
-		block_start = em->block_start;
-		free_extent_map(em);
-		em = NULL;
-
-		if (block_start == EXTENT_MAP_HOLE ||
-		    block_start == EXTENT_MAP_INLINE) {
-			clear_extent_dirty(tree, cur,
-					   cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-
-		/* leave this out until we have a page_mkwrite call */
-		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-				   EXTENT_DIRTY, 0)) {
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
-		if (tree->ops && tree->ops->writepage_io_hook) {
-			ret = tree->ops->writepage_io_hook(page, cur,
-						cur + iosize - 1);
-		} else {
-			ret = 0;
-		}
-		if (ret)
-			SetPageError(page);
-		else {
-			unsigned long max_nr = end_index + 1;
-			set_range_writeback(tree, cur, cur + iosize - 1);
-			if (!PageWriteback(page)) {
-				printk("warning page %lu not writeback, "
-				       "cur %llu end %llu\n", page->index,
-				       (unsigned long long)cur,
-				       (unsigned long long)end);
-			}
-
-			ret = submit_extent_page(WRITE, tree, page, sector,
-						 iosize, page_offset, bdev,
-						 &epd->bio, max_nr,
-						 end_bio_extent_writepage);
-			if (ret)
-				SetPageError(page);
-		}
-		cur = cur + iosize;
-		page_offset += iosize;
-		nr++;
-	}
-done:
-	if (nr == 0) {
-		/* make sure the mapping tag for page dirty gets cleared */
-		set_page_writeback(page);
-		end_page_writeback(page);
-	}
-	unlock_extent(tree, start, page_end, GFP_NOFS);
-	unlock_page(page);
-	return 0;
-}
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-
-/* Taken directly from 2.6.23 for 2.6.18 back port */
-typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
-                                void *data);
-
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space
- * and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * If a page is already under I/O, write_cache_pages() skips it, even
- * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
- * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
- * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them.  If wbc->sync_mode is
- * WB_SYNC_ALL then we were called for data integrity and we must wait for
- * existing IO to complete.
- */
-static int write_cache_pages(struct address_space *mapping,
-		      struct writeback_control *wbc, writepage_t writepage,
-		      void *data)
-{
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int ret = 0;
-	int done = 0;
-	struct pagevec pvec;
-	int nr_pages;
-	pgoff_t index;
-	pgoff_t end;		/* Inclusive */
-	int scanned = 0;
-	int range_whole = 0;
-
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
-	pagevec_init(&pvec, 0);
-	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
-	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		scanned = 1;
-	}
-retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		unsigned i;
-
-		scanned = 1;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
-			 */
-			lock_page(page);
-
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
-			}
-
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
-
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
-
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
-			}
-
-			ret = (*writepage)(page, wbc, data);
-
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-				unlock_page(page);
-				ret = 0;
-			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-	if (!scanned && !done) {
-		/*
-		 * We hit the last page and there is more work to be done: wrap
-		 * back to the start of the file
-		 */
-		scanned = 1;
-		index = 0;
-		goto retry;
-	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
-	return ret;
-}
-#endif
-
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
-			  struct writeback_control *wbc)
-{
-	int ret;
-	struct address_space *mapping = page->mapping;
-	struct extent_page_data epd = {
-		.bio = NULL,
-		.tree = tree,
-		.get_extent = get_extent,
-	};
-	struct writeback_control wbc_writepages = {
-		.bdi		= wbc->bdi,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.nr_to_write	= 64,
-		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end	= (loff_t)-1,
-	};
-
-
-	ret = __extent_writepage(page, wbc, &epd);
-
-	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
-	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(extent_write_full_page);
-
-
-int extent_writepages(struct extent_map_tree *tree,
-		      struct address_space *mapping,
-		      get_extent_t *get_extent,
-		      struct writeback_control *wbc)
-{
-	int ret = 0;
-	struct extent_page_data epd = {
-		.bio = NULL,
-		.tree = tree,
-		.get_extent = get_extent,
-	};
-
-	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
-	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(extent_writepages);
-
-int extent_readpages(struct extent_map_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent)
-{
-	struct bio *bio = NULL;
-	unsigned page_idx;
-	struct pagevec pvec;
-
-	pagevec_init(&pvec, 0);
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
-
-		prefetchw(&page->flags);
-		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
-					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add(&pvec);
-			__extent_read_full_page(tree, page, get_extent, &bio);
-		}
-		page_cache_release(page);
-	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
-	BUG_ON(!list_empty(pages));
-	if (bio)
-		submit_one_bio(READ, bio);
-	return 0;
-}
-EXPORT_SYMBOL(extent_readpages);
-
-/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
- * records from the tree
- */
-int extent_invalidatepage(struct extent_map_tree *tree,
-			  struct page *page, unsigned long offset)
-{
-	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
-
-	start += (offset + blocksize -1) & ~(blocksize - 1);
-	if (start > end)
-		return 0;
-
-	lock_extent(tree, start, end, GFP_NOFS);
-	wait_on_extent_writeback(tree, start, end);
-	clear_extent_bit(tree, start, end,
-			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-			 1, 1, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(extent_invalidatepage);
-
-/*
- * simple commit_write call, set_range_dirty is used to mark both
- * the pages and the extent records as dirty
- */
-int extent_commit_write(struct extent_map_tree *tree,
-			struct inode *inode, struct page *page,
-			unsigned from, unsigned to)
-{
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	set_page_extent_mapped(page);
-	set_page_dirty(page);
-
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(extent_commit_write);
-
-int extent_prepare_write(struct extent_map_tree *tree,
-			 struct inode *inode, struct page *page,
-			 unsigned from, unsigned to, get_extent_t *get_extent)
-{
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	u64 block_start;
-	u64 orig_block_start;
-	u64 block_end;
-	u64 cur_end;
-	struct extent_map *em;
-	unsigned blocksize = 1 << inode->i_blkbits;
-	size_t page_offset = 0;
-	size_t block_off_start;
-	size_t block_off_end;
-	int err = 0;
-	int iocount = 0;
-	int ret = 0;
-	int isnew;
-
-	set_page_extent_mapped(page);
-
-	block_start = (page_start + from) & ~((u64)blocksize - 1);
-	block_end = (page_start + to - 1) | (blocksize - 1);
-	orig_block_start = block_start;
-
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
-	while(block_start <= block_end) {
-		em = get_extent(inode, page, page_offset, block_start,
-				block_end, 1);
-		if (IS_ERR(em) || !em) {
-			goto err;
-		}
-		cur_end = min(block_end, em->end);
-		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
-		block_off_end = block_off_start + blocksize;
-		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
-
-		if (!PageUptodate(page) && isnew &&
-		    (block_off_end > to || block_off_start < from)) {
-			void *kaddr;
-
-			kaddr = kmap_atomic(page, KM_USER0);
-			if (block_off_end > to)
-				memset(kaddr + to, 0, block_off_end - to);
-			if (block_off_start < from)
-				memset(kaddr + block_off_start, 0,
-				       from - block_off_start);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		if ((em->block_start != EXTENT_MAP_HOLE &&
-		     em->block_start != EXTENT_MAP_INLINE) &&
-		    !isnew && !PageUptodate(page) &&
-		    (block_off_end > to || block_off_start < from) &&
-		    !test_range_bit(tree, block_start, cur_end,
-				    EXTENT_UPTODATE, 1)) {
-			u64 sector;
-			u64 extent_offset = block_start - em->start;
-			size_t iosize;
-			sector = (em->block_start + extent_offset) >> 9;
-			iosize = (cur_end - block_start + blocksize) &
-				~((u64)blocksize - 1);
-			/*
-			 * we've already got the extent locked, but we
-			 * need to split the state such that our end_bio
-			 * handler can clear the lock.
-			 */
-			set_extent_bit(tree, block_start,
-				       block_start + iosize - 1,
-				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
-			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset, em->bdev,
-					 NULL, 1,
-					 end_bio_extent_preparewrite);
-			iocount++;
-			block_start = block_start + iosize;
-		} else {
-			set_extent_uptodate(tree, block_start, cur_end,
-					    GFP_NOFS);
-			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
-			block_start = cur_end + 1;
-		}
-		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
-		free_extent_map(em);
-	}
-	if (iocount) {
-		wait_extent_bit(tree, orig_block_start,
-				block_end, EXTENT_LOCKED);
-	}
-	check_page_uptodate(tree, page);
-err:
-	/* FIXME, zero out newly allocated blocks on error */
-	return err;
-}
-EXPORT_SYMBOL(extent_prepare_write);
-
-/*
- * a helper for releasepage.  As long as there are no locked extents
- * in the range corresponding to the page, both state records and extent
- * map records are removed
- */
-int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
-{
-	struct extent_map *em;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	u64 orig_start = start;
-	int ret = 1;
-
-	while (start <= end) {
-		em = lookup_extent_mapping(tree, start, end);
-		if (!em || IS_ERR(em))
-			break;
-		if (!test_range_bit(tree, em->start, em->end,
-				    EXTENT_LOCKED, 0)) {
-			remove_extent_mapping(tree, em);
-			/* once for the rb tree */
-			free_extent_map(em);
-		}
-		start = em->end + 1;
-		/* once for us */
-		free_extent_map(em);
-	}
-	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
-		ret = 0;
-	else
-		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-				 1, 1, GFP_NOFS);
-	return ret;
-}
-EXPORT_SYMBOL(try_release_extent_mapping);
-
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-		get_extent_t *get_extent)
-{
-	struct inode *inode = mapping->host;
-	u64 start = iblock << inode->i_blkbits;
-	u64 end = start + (1 << inode->i_blkbits) - 1;
-	sector_t sector = 0;
-	struct extent_map *em;
-
-	em = get_extent(inode, NULL, 0, start, end, 0);
-	if (!em || IS_ERR(em))
-		return 0;
-
-	if (em->block_start == EXTENT_MAP_INLINE ||
-	    em->block_start == EXTENT_MAP_HOLE)
-		goto out;
-
-	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-out:
-	free_extent_map(em);
-	return sector;
-}
-
-static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
-{
-	if (list_empty(&eb->lru)) {
-		extent_buffer_get(eb);
-		list_add(&eb->lru, &tree->buffer_lru);
-		tree->lru_size++;
-		if (tree->lru_size >= BUFFER_LRU_MAX) {
-			struct extent_buffer *rm;
-			rm = list_entry(tree->buffer_lru.prev,
-					struct extent_buffer, lru);
-			tree->lru_size--;
-			list_del_init(&rm->lru);
-			free_extent_buffer(rm);
-		}
-	} else
-		list_move(&eb->lru, &tree->buffer_lru);
-	return 0;
-}
-static struct extent_buffer *find_lru(struct extent_map_tree *tree,
-				      u64 start, unsigned long len)
-{
-	struct list_head *lru = &tree->buffer_lru;
-	struct list_head *cur = lru->next;
-	struct extent_buffer *eb;
-
-	if (list_empty(lru))
-		return NULL;
-
-	do {
-		eb = list_entry(cur, struct extent_buffer, lru);
-		if (eb->start == start && eb->len == len) {
-			extent_buffer_get(eb);
-			return eb;
-		}
-		cur = cur->next;
-	} while (cur != lru);
-	return NULL;
-}
-
-static inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
-}
-
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
-{
-	struct page *p;
-	struct address_space *mapping;
-
-	if (i == 0)
-		return eb->first_page;
-	i += eb->start >> PAGE_CACHE_SHIFT;
-	mapping = eb->first_page->mapping;
-	read_lock_irq(&mapping->tree_lock);
-	p = radix_tree_lookup(&mapping->page_tree, i);
-	read_unlock_irq(&mapping->tree_lock);
-	return p;
-}
-
-static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
-						   u64 start,
-						   unsigned long len,
-						   gfp_t mask)
-{
-	struct extent_buffer *eb = NULL;
-
-	spin_lock(&tree->lru_lock);
-	eb = find_lru(tree, start, len);
-	spin_unlock(&tree->lru_lock);
-	if (eb) {
-		return eb;
-	}
-
-	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
-	INIT_LIST_HEAD(&eb->lru);
-	eb->start = start;
-	eb->len = len;
-	atomic_set(&eb->refs, 1);
-
-	return eb;
-}
-
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
-	kmem_cache_free(extent_buffer_cache, eb);
-}
-
-struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0,
-					  gfp_t mask)
-{
-	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	struct extent_buffer *eb;
-	struct page *p;
-	struct address_space *mapping = tree->mapping;
-	int uptodate = 1;
-
-	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
-		return NULL;
-
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
-	if (page0) {
-		eb->first_page = page0;
-		i = 1;
-		index++;
-		page_cache_get(page0);
-		mark_page_accessed(page0);
-		set_page_extent_mapped(page0);
-		WARN_ON(!PageUptodate(page0));
-		set_page_extent_head(page0, len);
-	} else {
-		i = 0;
-	}
-	for (; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
-		if (!p) {
-			WARN_ON(1);
-			goto fail;
-		}
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
-		if (!PageUptodate(p))
-			uptodate = 0;
-		unlock_page(p);
-	}
-	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
-
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
-	return eb;
-
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
-	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
-		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-	return NULL;
-}
-EXPORT_SYMBOL(alloc_extent_buffer);
-
-struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
-					 u64 start, unsigned long len,
-					  gfp_t mask)
-{
-	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	struct extent_buffer *eb;
-	struct page *p;
-	struct address_space *mapping = tree->mapping;
-	int uptodate = 1;
-
-	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
-		return NULL;
-
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_lock_page(mapping, index);
-		if (!p) {
-			goto fail;
-		}
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
-
-		if (!PageUptodate(p))
-			uptodate = 0;
-		unlock_page(p);
-	}
-	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
-
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
-	return eb;
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
-	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
-		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-	return NULL;
-}
-EXPORT_SYMBOL(find_extent_buffer);
-
-void free_extent_buffer(struct extent_buffer *eb)
-{
-	unsigned long i;
-	unsigned long num_pages;
-
-	if (!eb)
-		return;
-
-	if (!atomic_dec_and_test(&eb->refs))
-		return;
-
-	WARN_ON(!list_empty(&eb->lru));
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	for (i = 1; i < num_pages; i++) {
-		page_cache_release(extent_buffer_page(eb, i));
-	}
-	page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-}
-EXPORT_SYMBOL(free_extent_buffer);
-
-int clear_extent_buffer_dirty(struct extent_map_tree *tree,
-			      struct extent_buffer *eb)
-{
-	int set;
-	unsigned long i;
-	unsigned long num_pages;
-	struct page *page;
-
-	u64 start = eb->start;
-	u64 end = start + eb->len - 1;
-
-	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		lock_page(page);
-		if (i == 0)
-			set_page_extent_head(page, eb->len);
-		else
-			set_page_private(page, EXTENT_PAGE_PRIVATE);
-
-		/*
-		 * if we're on the last page or the first page and the
-		 * block isn't aligned on a page boundary, do extra checks
-		 * to make sure we don't clean page that is partially dirty
-		 */
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = (u64)page->index << PAGE_CACHE_SHIFT;
-			end  = start + PAGE_CACHE_SIZE - 1;
-			if (test_range_bit(tree, start, end,
-					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
-				continue;
-			}
-		}
-		clear_page_dirty_for_io(page);
-		write_lock_irq(&page->mapping->tree_lock);
-		if (!PageDirty(page)) {
-			radix_tree_tag_clear(&page->mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-		}
-		write_unlock_irq(&page->mapping->tree_lock);
-		unlock_page(page);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(clear_extent_buffer_dirty);
-
-int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
-				    struct extent_buffer *eb)
-{
-	return wait_on_extent_writeback(tree, eb->start,
-					eb->start + eb->len - 1);
-}
-EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
-
-int set_extent_buffer_dirty(struct extent_map_tree *tree,
-			     struct extent_buffer *eb)
-{
-	unsigned long i;
-	unsigned long num_pages;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		/* writepage may need to do something special for the
-		 * first page, we have to make sure page->private is
-		 * properly set.  releasepage may drop page->private
-		 * on us if the page isn't already dirty.
-		 */
-		if (i == 0) {
-			lock_page(page);
-			set_page_extent_head(page, eb->len);
-		} else if (PagePrivate(page) &&
-			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
-			set_page_extent_mapped(page);
-			unlock_page(page);
-		}
-		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
-	}
-	return set_extent_dirty(tree, eb->start,
-				eb->start + eb->len - 1, GFP_NOFS);
-}
-EXPORT_SYMBOL(set_extent_buffer_dirty);
-
-int set_extent_buffer_uptodate(struct extent_map_tree *tree,
-				struct extent_buffer *eb)
-{
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			    GFP_NOFS);
-	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			check_page_uptodate(tree, page);
-			continue;
-		}
-		SetPageUptodate(page);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(set_extent_buffer_uptodate);
-
-int extent_buffer_uptodate(struct extent_map_tree *tree,
-			     struct extent_buffer *eb)
-{
-	if (eb->flags & EXTENT_UPTODATE)
-		return 1;
-	return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1);
-}
-EXPORT_SYMBOL(extent_buffer_uptodate);
-
-int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb,
-			     u64 start,
-			     int wait)
-{
-	unsigned long i;
-	unsigned long start_i;
-	struct page *page;
-	int err;
-	int ret = 0;
-	unsigned long num_pages;
-
-	if (eb->flags & EXTENT_UPTODATE)
-		return 0;
-
-	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1)) {
-		return 0;
-	}
-
-	if (start) {
-		WARN_ON(start < eb->start);
-		start_i = (start >> PAGE_CACHE_SHIFT) -
-			(eb->start >> PAGE_CACHE_SHIFT);
-	} else {
-		start_i = 0;
-	}
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		if (PageUptodate(page)) {
-			continue;
-		}
-		if (!wait) {
-			if (TestSetPageLocked(page)) {
-				continue;
-			}
-		} else {
-			lock_page(page);
-		}
-		if (!PageUptodate(page)) {
-			err = page->mapping->a_ops->readpage(NULL, page);
-			if (err) {
-				ret = err;
-			}
-		} else {
-			unlock_page(page);
-		}
-	}
-
-	if (ret || !wait) {
-		return ret;
-	}
-
-	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		wait_on_page_locked(page);
-		if (!PageUptodate(page)) {
-			ret = -EIO;
-		}
-	}
-	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
-	return ret;
-}
-EXPORT_SYMBOL(read_extent_buffer_pages);
-
-void read_extent_buffer(struct extent_buffer *eb, void *dstv,
-			unsigned long start,
-			unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *dst = (char *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		if (!PageUptodate(page)) {
-			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
-			WARN_ON(1);
-		}
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		kaddr = kmap_atomic(page, KM_USER1);
-		memcpy(dst, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER1);
-
-		dst += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(read_extent_buffer);
-
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-			       unsigned long min_len, char **token, char **map,
-			       unsigned long *map_start,
-			       unsigned long *map_len, int km)
-{
-	size_t offset = start & (PAGE_CACHE_SIZE - 1);
-	char *kaddr;
-	struct page *p;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long end_i = (start_offset + start + min_len - 1) >>
-		PAGE_CACHE_SHIFT;
-
-	if (i != end_i)
-		return -EINVAL;
-
-	if (i == 0) {
-		offset = start_offset;
-		*map_start = 0;
-	} else {
-		offset = 0;
-		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
-	}
-	if (start + min_len > eb->len) {
-printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
-		WARN_ON(1);
-	}
-
-	p = extent_buffer_page(eb, i);
-	WARN_ON(!PageUptodate(p));
-	kaddr = kmap_atomic(p, km);
-	*token = kaddr;
-	*map = kaddr + offset;
-	*map_len = PAGE_CACHE_SIZE - offset;
-	return 0;
-}
-EXPORT_SYMBOL(map_private_extent_buffer);
-
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-		      unsigned long min_len,
-		      char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km)
-{
-	int err;
-	int save = 0;
-	if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, km);
-		eb->map_token = NULL;
-		save = 1;
-	}
-	err = map_private_extent_buffer(eb, start, min_len, token, map,
-				       map_start, map_len, km);
-	if (!err && save) {
-		eb->map_token = *token;
-		eb->kaddr = *map;
-		eb->map_start = *map_start;
-		eb->map_len = *map_len;
-	}
-	return err;
-}
-EXPORT_SYMBOL(map_extent_buffer);
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
-	kunmap_atomic(token, km);
-}
-EXPORT_SYMBOL(unmap_extent_buffer);
-
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *ptr = (char *)ptrv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	int ret = 0;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
-
-		kaddr = kmap_atomic(page, KM_USER0);
-		ret = memcmp(ptr, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-		if (ret)
-			break;
-
-		ptr += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(memcmp_extent_buffer);
-
-void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
-			 unsigned long start, unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *src = (char *)srcv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER1);
-		memcpy(kaddr + offset, src, cur);
-		kunmap_atomic(kaddr, KM_USER1);
-
-		src += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(write_extent_buffer);
-
-void memset_extent_buffer(struct extent_buffer *eb, char c,
-			  unsigned long start, unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, c, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(memset_extent_buffer);
-
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
-			unsigned long dst_offset, unsigned long src_offset,
-			unsigned long len)
-{
-	u64 dst_len = dst->len;
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(src->len != dst_len);
-
-	offset = (start_offset + dst_offset) &
-		((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(dst, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
-
-		kaddr = kmap_atomic(page, KM_USER0);
-		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		src_offset += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(copy_extent_buffer);
-
-static void move_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
-	if (dst_page == src_page) {
-		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
-	} else {
-		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
-		char *p = dst_kaddr + dst_off + len;
-		char *s = src_kaddr + src_off + len;
-
-		while (len--)
-			*--p = *--s;
-
-		kunmap_atomic(src_kaddr, KM_USER1);
-	}
-	kunmap_atomic(dst_kaddr, KM_USER0);
-}
-
-static void copy_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
-	char *src_kaddr;
-
-	if (dst_page != src_page)
-		src_kaddr = kmap_atomic(src_page, KM_USER1);
-	else
-		src_kaddr = dst_kaddr;
-
-	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-	kunmap_atomic(dst_kaddr, KM_USER0);
-	if (dst_page != src_page)
-		kunmap_atomic(src_kaddr, KM_USER1);
-}
-
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len)
-{
-	size_t cur;
-	size_t dst_off_in_page;
-	size_t src_off_in_page;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long dst_i;
-	unsigned long src_i;
-
-	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
-		BUG_ON(1);
-	}
-
-	while(len > 0) {
-		dst_off_in_page = (start_offset + dst_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = (start_offset + src_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-
-		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
-
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
-					       src_off_in_page));
-		cur = min_t(unsigned long, cur,
-			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
-
-		copy_pages(extent_buffer_page(dst, dst_i),
-			   extent_buffer_page(dst, src_i),
-			   dst_off_in_page, src_off_in_page, cur);
-
-		src_offset += cur;
-		dst_offset += cur;
-		len -= cur;
-	}
-}
-EXPORT_SYMBOL(memcpy_extent_buffer);
-
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len)
-{
-	size_t cur;
-	size_t dst_off_in_page;
-	size_t src_off_in_page;
-	unsigned long dst_end = dst_offset + len - 1;
-	unsigned long src_end = src_offset + len - 1;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long dst_i;
-	unsigned long src_i;
-
-	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset < src_offset) {
-		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
-		return;
-	}
-	while(len > 0) {
-		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
-
-		dst_off_in_page = (start_offset + dst_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = (start_offset + src_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-
-		cur = min_t(unsigned long, len, src_off_in_page + 1);
-		cur = min(cur, dst_off_in_page + 1);
-		move_pages(extent_buffer_page(dst, dst_i),
-			   extent_buffer_page(dst, src_i),
-			   dst_off_in_page - cur + 1,
-			   src_off_in_page - cur + 1, cur);
-
-		dst_end -= cur;
-		src_end -= cur;
-		len -= cur;
-	}
-}
-EXPORT_SYMBOL(memmove_extent_buffer);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ea60f5447b5b..56314217cfc0 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,215 +3,53 @@
 
 #include <linux/rbtree.h>
 
+#define EXTENT_MAP_LAST_BYTE (u64)-4
 #define EXTENT_MAP_HOLE (u64)-3
 #define EXTENT_MAP_INLINE (u64)-2
 #define EXTENT_MAP_DELALLOC (u64)-1
 
-/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_DEFRAG_DONE (1 << 7)
-#define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_CSUM (1 << 9)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-
-/*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
- */
-#define EXTENT_PAGE_PRIVATE 1
-#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
-
-
-struct extent_map_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
-	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
-};
-
-struct extent_map_tree {
-	struct rb_root map;
-	struct rb_root state;
-	struct address_space *mapping;
-	u64 dirty_bytes;
-	rwlock_t lock;
-	struct extent_map_ops *ops;
-	spinlock_t lru_lock;
-	struct list_head buffer_lru;
-	int lru_size;
-};
-
-/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
 struct extent_map {
-	u64 start;
-	u64 end; /* inclusive */
-	int in_tree;
 	struct rb_node rb_node;
-	/* block_start and block_end are in bytes */
+
+	/* all of these are in bytes */
+	u64 start;
+	u64 len;
 	u64 block_start;
-	u64 block_end; /* inclusive */
+	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-};
-
-/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
-struct extent_state {
-	u64 start;
-	u64 end; /* inclusive */
 	int in_tree;
-	struct rb_node rb_node;
-	wait_queue_head_t wq;
-	atomic_t refs;
-	unsigned long state;
-
-	/* for use by the FS */
-	u64 private;
-
-	struct list_head list;
 };
 
-struct extent_buffer {
-	u64 start;
-	unsigned long len;
-	char *map_token;
-	char *kaddr;
-	unsigned long map_start;
-	unsigned long map_len;
-	struct page *first_page;
-	struct list_head lru;
-	atomic_t refs;
-	int flags;
+struct extent_map_tree {
+	struct rb_root map;
+	struct extent_map *last;
+	spinlock_t lock;
 };
 
-typedef struct extent_map *(get_extent_t)(struct inode *inode,
-					  struct page *page,
-					  size_t page_offset,
-					  u64 start, u64 end,
-					  int create);
+static inline u64 extent_map_end(struct extent_map *em)
+{
+	if (em->start + em->len < em->start)
+		return (u64)-1;
+	return em->start + em->len;
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+	if (em->block_start + em->len < em->block_start)
+		return (u64)-1;
+	return em->block_start + em->len;
+}
 
-void extent_map_tree_init(struct extent_map_tree *tree,
-			  struct address_space *mapping, gfp_t mask);
-void extent_map_tree_empty_lru(struct extent_map_tree *tree);
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 end);
+					 u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
-int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page);
-int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
-int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+
 struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-
-u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 search_end,
-		     u64 max_bytes, unsigned long bits);
-
-int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		   int bits, int filled);
-int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask);
-int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask);
-int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			gfp_t mask);
-int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		   gfp_t mask);
-int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
-int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask);
-int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
-int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
-int extent_invalidatepage(struct extent_map_tree *tree,
-			  struct page *page, unsigned long offset);
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
-			  struct writeback_control *wbc);
-int extent_writepages(struct extent_map_tree *tree,
-		      struct address_space *mapping,
-		      get_extent_t *get_extent,
-		      struct writeback_control *wbc);
-int extent_readpages(struct extent_map_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent);
-int extent_prepare_write(struct extent_map_tree *tree,
-			 struct inode *inode, struct page *page,
-			 unsigned from, unsigned to, get_extent_t *get_extent);
-int extent_commit_write(struct extent_map_tree *tree,
-			struct inode *inode, struct page *page,
-			unsigned from, unsigned to);
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-		get_extent_t *get_extent);
-int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
-int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
-int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
-void set_page_extent_mapped(struct page *page);
-
-struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0,
-					  gfp_t mask);
-struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
-					 u64 start, unsigned long len,
-					  gfp_t mask);
-void free_extent_buffer(struct extent_buffer *eb);
-int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb, u64 start, int wait);
-
-static inline void extent_buffer_get(struct extent_buffer *eb)
-{
-	atomic_inc(&eb->refs);
-}
-
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len);
-void read_extent_buffer(struct extent_buffer *eb, void *dst,
-			unsigned long start,
-			unsigned long len);
-void write_extent_buffer(struct extent_buffer *eb, const void *src,
-			 unsigned long start, unsigned long len);
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
-			unsigned long dst_offset, unsigned long src_offset,
-			unsigned long len);
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len);
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len);
-void memset_extent_buffer(struct extent_buffer *eb, char c,
-			  unsigned long start, unsigned long len);
-int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
-				    struct extent_buffer *eb);
-int clear_extent_buffer_dirty(struct extent_map_tree *tree,
-			      struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_map_tree *tree,
-			     struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_map_tree *tree,
-			       struct extent_buffer *eb);
-int extent_buffer_uptodate(struct extent_map_tree *tree,
-			   struct extent_buffer *eb);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1cd8c908811e..c5bb00f92396 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -233,8 +233,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int i;
 	struct inode *inode = fdentry(file)->d_inode;
-	struct extent_map *em;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	u64 hint_byte;
 	u64 num_bytes;
 	u64 start_pos;
@@ -242,11 +241,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 end_pos = pos + write_bytes;
 	u64 inline_size;
 	loff_t isize = i_size_read(inode);
-	em = alloc_extent_map(GFP_NOFS);
-	if (!em)
-		return -ENOMEM;
-
-	em->bdev = inode->i_sb->s_bdev;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
 	num_bytes = (write_bytes + pos - start_pos +
@@ -254,7 +248,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	end_of_last_block = start_pos + num_bytes - 1;
 
-	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
@@ -268,7 +262,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	if ((end_of_last_block & 4095) == 0) {
 		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
 	}
-	set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* FIXME...EIEIO, ENOSPC and more */
 
@@ -293,6 +287,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       inode->i_ino,
 						       last_pos_in_file,
 						       0, 0, hole_size);
+			btrfs_drop_extent_cache(inode, last_pos_in_file,
+					last_pos_in_file + hole_size -1);
 			btrfs_check_file(root, inode);
 		}
 		if (err)
@@ -320,12 +316,12 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		last_end += PAGE_CACHE_SIZE - 1;
 		if (start_pos < isize) {
 			u64 delalloc_start = start_pos;
-			existing_delalloc = count_range_bits(em_tree,
+			existing_delalloc = count_range_bits(io_tree,
 					     &delalloc_start,
 					     end_of_last_block, (u64)-1,
 					     EXTENT_DELALLOC);
 		}
-		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
+		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
 		spin_lock(&root->fs_info->delalloc_lock);
 		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
@@ -346,6 +342,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		inline_size -= start_pos;
 		err = insert_inline_extent(trans, root, inode, start_pos,
 					   inline_size, pages, 0, num_pages);
+		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
 	}
 	if (end_pos > isize) {
@@ -356,8 +353,7 @@ failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
-	free_extent_map(em);
+	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
 
@@ -367,10 +363,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 
 	while(1) {
+		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, end);
-		if (!em)
+		if (!em) {
+			spin_unlock(&em_tree->lock);
 			break;
+		}
 		remove_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+
 		/* once for us */
 		free_extent_map(em);
 		/* once for the tree*/
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67005480e139..16d3aef45d18 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -53,7 +53,7 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct address_space_operations btrfs_symlink_aops;
 static struct file_operations btrfs_dir_file_operations;
-static struct extent_map_ops btrfs_extent_map_ops;
+static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
@@ -104,6 +104,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
+	u64 orig_start = start;
+	u64 orig_num_bytes;
 	struct btrfs_key ins;
 	int ret;
 
@@ -115,6 +117,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	num_bytes = max(blocksize,  num_bytes);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 start, start + num_bytes, start, &alloc_hint);
+	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
@@ -138,6 +141,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+	btrfs_drop_extent_cache(inode, orig_start,
+				orig_start + orig_num_bytes - 1);
 	btrfs_add_ordered_inode(inode);
 out:
 	btrfs_end_transaction(trans, root);
@@ -297,7 +302,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	int ret = 0;
 	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
 	u32 csum;
@@ -317,7 +322,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
 			   BTRFS_CRC32_SIZE);
-	set_state_private(em_tree, start, csum);
+	set_state_private(io_tree, start, csum);
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -329,17 +334,19 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
 	u64 private;
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
 	unsigned long flags;
+
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-	ret = get_state_private(em_tree, start, &private);
+
+	ret = get_state_private(io_tree, start, &private);
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
 	if (ret) {
@@ -428,7 +435,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -873,7 +880,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 			      size_t zero_start)
 {
 	char *kaddr;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -884,12 +891,12 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 	WARN_ON(!PageLocked(page));
 	set_page_extent_mapped(page);
 
-	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 	delalloc_start = page_start;
-	existing_delalloc = count_range_bits(&BTRFS_I(inode)->extent_tree,
+	existing_delalloc = count_range_bits(&BTRFS_I(inode)->io_tree,
 					     &delalloc_start, page_end,
 					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-	set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start,
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
 			    page_end, GFP_NOFS);
 
 	spin_lock(&root->fs_info->delalloc_lock);
@@ -903,7 +910,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 		kunmap(page);
 	}
 	set_page_dirty(page);
-	unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	return ret;
 }
@@ -961,7 +968,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
 		struct btrfs_trans_handle *trans;
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 		u64 mask = root->sectorsize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
@@ -986,7 +993,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(em_tree, pos, block_end, GFP_NOFS);
+		lock_extent(io_tree, pos, block_end, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -1001,11 +1008,13 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						       inode->i_ino,
 						       hole_start, 0, 0,
 						       hole_size);
+			btrfs_drop_extent_cache(inode, hole_start,
+						hole_size - 1);
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		unlock_extent(em_tree, pos, block_end, GFP_NOFS);
+		unlock_extent(io_tree, pos, block_end, GFP_NOFS);
 		if (err)
 			return err;
 	}
@@ -1189,7 +1198,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
 	return 0;
 }
@@ -1485,7 +1495,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
 	BTRFS_I(inode)->root = root;
 
@@ -1672,9 +1683,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -1816,7 +1828,7 @@ out_unlock:
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 end,
+				    size_t page_offset, u64 start, u64 len,
 				    int create)
 {
 	int ret;
@@ -1826,7 +1838,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	u32 found_type;
-	int failed_insert = 0;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *item;
@@ -1834,6 +1845,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
 
 	path = btrfs_alloc_path();
@@ -1841,24 +1853,26 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	mutex_lock(&root->fs_info->fs_mutex);
 
 again:
-	em = lookup_extent_mapping(em_tree, start, end);
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	spin_unlock(&em_tree->lock);
+
 	if (em) {
 		if (em->start > start) {
-			printk("get_extent start %Lu em start %Lu\n",
-			       start, em->start);
+			printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n",
+			       start, len, em->start, em->len);
 			WARN_ON(1);
 		}
 		goto out;
 	}
+	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
-		em = alloc_extent_map(GFP_NOFS);
-		if (!em) {
-			err = -ENOMEM;
-			goto out;
-		}
-		em->start = EXTENT_MAP_HOLE;
-		em->end = EXTENT_MAP_HOLE;
+		err = -ENOMEM;
+		goto out;
 	}
+
+	em->start = EXTENT_MAP_HOLE;
+	em->len = (u64)-1;
 	em->bdev = inode->i_sb->s_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
@@ -1893,28 +1907,25 @@ again:
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
-				if (end < extent_start)
+				if (start + len <= extent_start)
 					goto not_found;
-				em->end = extent_end - 1;
+				em->len = extent_end - extent_start;
 			} else {
-				em->end = end;
+				em->len = len;
 			}
 			goto not_found_em;
 		}
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->start = extent_start;
-			em->end = extent_end - 1;
+			em->len = extent_end - extent_start;
 			em->block_start = EXTENT_MAP_HOLE;
-			em->block_end = EXTENT_MAP_HOLE;
 			goto insert;
 		}
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		em->block_start = bytenr;
-		em->block_end = em->block_start +
-			btrfs_file_extent_num_bytes(leaf, item) - 1;
 		em->start = extent_start;
-		em->end = extent_end - 1;
+		em->len = extent_end - extent_start;
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
@@ -1925,25 +1936,24 @@ again:
 
 		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
 						    path->slots[0]));
-		extent_end = (extent_start + size - 1) |
-			((u64)root->sectorsize - 1);
+		extent_end = (extent_start + size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
-				if (end < extent_start)
+				if (start + len <= extent_start)
 					goto not_found;
-				em->end = extent_end;
+				em->len = extent_end - extent_start;
 			} else {
-				em->end = end;
+				em->len = len;
 			}
 			goto not_found_em;
 		}
 		em->block_start = EXTENT_MAP_INLINE;
-		em->block_end = EXTENT_MAP_INLINE;
 
 		if (!page) {
 			em->start = extent_start;
-			em->end = extent_start + size - 1;
+			em->len = size;
 			goto out;
 		}
 
@@ -1952,8 +1962,7 @@ again:
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->end = (em->start + copy_size -1) |
-			((u64)root->sectorsize -1);
+		em->len = copy_size;
 		map = kmap(page);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
@@ -1974,7 +1983,8 @@ again:
 			btrfs_mark_buffer_dirty(leaf);
 		}
 		kunmap(page);
-		set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS);
+		set_extent_uptodate(io_tree, em->start,
+				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
 	} else {
 		printk("unkknown found_type %d\n", found_type);
@@ -1982,33 +1992,29 @@ again:
 	}
 not_found:
 	em->start = start;
-	em->end = end;
+	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
-	em->block_end = EXTENT_MAP_HOLE;
 insert:
 	btrfs_release_path(root, path);
-	if (em->start > start || em->end < start) {
-		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->end, start, end);
+	if (em->start > start || extent_map_end(em) <= start) {
+		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
 		err = -EIO;
 		goto out;
 	}
+
+	err = 0;
+	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		free_extent_map(em);
-		em = NULL;
-		if (0 && failed_insert == 1) {
-			btrfs_drop_extent_cache(inode, start, end);
-		}
-		failed_insert++;
-		if (failed_insert > 5) {
-			printk("failing to insert %Lu %Lu\n", start, end);
+		em = lookup_extent_mapping(em_tree, start, len);
+		if (!em) {
 			err = -EIO;
-			goto out;
+			printk("failing to insert %Lu %Lu\n", start, len);
 		}
-		goto again;
 	}
-	err = 0;
+	spin_unlock(&em_tree->lock);
 out:
 	btrfs_free_path(path);
 	if (trans) {
@@ -2032,14 +2038,14 @@ static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 
 int btrfs_readpage(struct file *file, struct page *page)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_read_full_page(tree, page, btrfs_get_extent);
 }
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 
 
 	if (current->flags & PF_MEMALLOC) {
@@ -2047,15 +2053,15 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		return 0;
 	}
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
 static int btrfs_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 }
 
@@ -2063,19 +2069,21 @@ static int
 btrfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
 
 static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
+	struct extent_map_tree *map;
 	int ret;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(tree, page);
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	map = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(map, tree, page);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
@@ -2086,9 +2094,9 @@ static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btrfs_releasepage(page, GFP_NOFS);
 }
@@ -2374,7 +2382,7 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 int btrfs_defrag_file(struct file *file) {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct page *page;
 	unsigned long last_index;
 	unsigned long ra_index = 0;
@@ -2414,13 +2422,13 @@ int btrfs_defrag_file(struct file *file) {
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		delalloc_start = page_start;
 		existing_delalloc =
-			count_range_bits(&BTRFS_I(inode)->extent_tree,
+			count_range_bits(&BTRFS_I(inode)->io_tree,
 					 &delalloc_start, page_end,
 					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-		set_extent_delalloc(em_tree, page_start,
+		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -2428,7 +2436,7 @@ int btrfs_defrag_file(struct file *file) {
 						 existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
 
-		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2842,9 +2850,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2934,7 +2943,7 @@ static struct file_operations btrfs_dir_file_operations = {
 #endif
 };
 
-static struct extent_map_ops btrfs_extent_map_ops = {
+static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.writepage_io_hook = btrfs_writepage_io_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8b52c69fda2e..f8a1016600b1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -468,10 +468,15 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_init_cachep();
 	if (err)
 		goto free_transaction_sys;
-	err = extent_map_init();
+
+	err = extent_io_init();
 	if (err)
 		goto free_cachep;
 
+	err = extent_map_init();
+	if (err)
+		goto free_extent_io;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto free_extent_map;
@@ -479,6 +484,8 @@ static int __init init_btrfs_fs(void)
 
 free_extent_map:
 	extent_map_exit();
+free_extent_io:
+	extent_io_exit();
 free_cachep:
 	btrfs_destroy_cachep();
 free_transaction_sys:
@@ -492,6 +499,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
 	extent_map_exit();
+	extent_io_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 163c01a24498..b4a1bc62a784 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -70,7 +70,7 @@ static int join_transaction(struct btrfs_root *root)
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
-		extent_map_tree_init(&cur_trans->dirty_pages,
+		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
 	} else {
@@ -153,7 +153,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	int ret;
 	int err;
 	int werr = 0;
-	struct extent_map_tree *dirty_pages;
+	struct extent_io_tree *dirty_pages;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 start;
@@ -610,7 +610,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
-	struct extent_map_tree *pinned_copy;
+	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
 
@@ -639,7 +639,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	if (!pinned_copy)
 		return -ENOMEM;
 
-	extent_map_tree_init(pinned_copy,
+	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	trans->transaction->in_commit = 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fd52e9b23922..c3172ddb3321 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -29,7 +29,7 @@ struct btrfs_transaction {
 	int use_count;
 	int commit_done;
 	struct list_head list;
-	struct extent_map_tree dirty_pages;
+	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
 	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
-- 
cgit v1.2.3


From 9069218d448ea547dbad5f1cbd537e88d6519d66 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 8 Feb 2008 13:49:28 -0500
Subject: Btrfs: Fix i_blocks accounting

Now that delayed allocation accounting works, i_blocks accounting is changed
to only modify i_blocks when extents inserted or removed.

The fillattr call is changed to include the delayed allocation byte count
in the i_blocks result.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       |  9 +++++++++
 fs/btrfs/file.c        | 17 +++++++++++------
 fs/btrfs/inode.c       | 43 +++++++++++++++++++++++++++++--------------
 4 files changed, 50 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9a0647dc5a0b..9b9db9cbc019 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,6 +36,7 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	u64 delalloc_bytes;
 	u32 flags;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dfb700efcd6b..52144b04f1af 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1161,6 +1161,15 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+static inline void dec_i_blocks(struct inode *inode, u64 dec)
+{
+	dec = dec >> 9;
+	if (dec <= inode->i_blocks)
+		inode->i_blocks -= dec;
+	else
+		inode->i_blocks = 0;
+}
+
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bfa4149c053d..f89396082544 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -175,6 +175,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 			leaf = path->nodes[0];
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			inode->i_blocks += (offset + size - found_end) >> 9;
 		}
 		if (found_end < offset) {
 			ptr = btrfs_file_extent_inline_start(ei) + found_size;
@@ -184,6 +185,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 insert:
 		btrfs_release_path(root, path);
 		datasize = offset + size - key.offset;
+		inode->i_blocks += datasize >> 9;
 		datasize = btrfs_file_extent_calc_inline_size(datasize);
 		ret = btrfs_insert_empty_item(trans, root, path, &key,
 					      datasize);
@@ -256,7 +258,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		goto out_unlock;
 	}
 	btrfs_set_trans_block_group(trans, inode);
-	inode->i_blocks += num_bytes >> 9;
 	hint_byte = 0;
 
 	if ((end_of_last_block & 4095) == 0) {
@@ -410,7 +411,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
 			goto out;
 
-		if (found_key.offset != last_offset) {
+		if (found_key.offset < last_offset) {
 			WARN_ON(1);
 			btrfs_print_leaf(root, leaf);
 			printk("inode %lu found offset %Lu expected %Lu\n",
@@ -435,7 +436,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		last_offset = extent_end;
 		path->slots[0]++;
 	}
-	if (last_offset < inode->i_size) {
+	if (0 && last_offset < inode->i_size) {
 		WARN_ON(1);
 		btrfs_print_leaf(root, leaf);
 		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
@@ -608,8 +609,7 @@ next_slot:
 								      extent);
 				if (btrfs_file_extent_disk_bytenr(leaf,
 								  extent)) {
-					inode->i_blocks -=
-						(old_num - new_num) >> 9;
+					dec_i_blocks(inode, old_num - new_num);
 				}
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
@@ -620,6 +620,8 @@ next_slot:
 				u32 new_size;
 				new_size = btrfs_file_extent_calc_inline_size(
 						   inline_limit - key.offset);
+				dec_i_blocks(inode, (extent_end - key.offset) -
+					(inline_limit - key.offset));
 				btrfs_truncate_item(trans, root, path,
 						    new_size, 1);
 			}
@@ -653,7 +655,7 @@ next_slot:
 			btrfs_release_path(root, path);
 			extent = NULL;
 			if (found_extent && disk_bytenr != 0) {
-				inode->i_blocks -= extent_num_bytes >> 9;
+				dec_i_blocks(inode, extent_num_bytes);
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr,
 						disk_num_bytes,
@@ -674,6 +676,8 @@ next_slot:
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
+			dec_i_blocks(inode, (extent_end - key.offset) -
+					(extent_end - end));
 			btrfs_truncate_item(trans, root, path, new_size, 0);
 		}
 		/* create bookend, splitting the extent in two */
@@ -718,6 +722,7 @@ next_slot:
 	}
 out:
 	btrfs_free_path(path);
+	btrfs_check_file(root, inode);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44fc94fee2c4..913ab128eee1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
 					       ins.offset);
+		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -142,6 +143,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	btrfs_drop_extent_cache(inode, orig_start,
 				orig_start + orig_num_bytes - 1);
 	btrfs_add_ordered_inode(inode);
+	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -265,6 +267,7 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		spin_lock(&root->fs_info->delalloc_lock);
+		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -281,8 +284,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			printk("warning: delalloc account %Lu %Lu\n",
 			       end - start + 1, root->fs_info->delalloc_bytes);
 			root->fs_info->delalloc_bytes = 0;
+			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
 			root->fs_info->delalloc_bytes -= end - start + 1;
+			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -833,32 +838,37 @@ search_again:
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
-					   extent_num_bytes) >> 9;
-				if (extent_start != 0) {
-					inode->i_blocks -= num_dec;
-				}
+					   extent_num_bytes);
+				if (extent_start != 0)
+					dec_i_blocks(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_bytes =
 					btrfs_file_extent_disk_num_bytes(leaf,
 									 fi);
 				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_bytes(leaf,
-								       fi) >> 9;
+				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					inode->i_blocks -= num_dec;
+					dec_i_blocks(inode, num_dec);
 				}
 				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
-		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE &&
-			   !del_item) {
-			u32 newsize = inode->i_size - found_key.offset;
-			newsize = btrfs_file_extent_calc_inline_size(newsize);
-			ret = btrfs_truncate_item(trans, root, path,
-						  newsize, 1);
-			BUG_ON(ret);
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			if (!del_item) {
+				u32 newsize = inode->i_size - found_key.offset;
+				dec_i_blocks(inode, item_end + 1 -
+					    found_key.offset - newsize);
+				newsize =
+				    btrfs_file_extent_calc_inline_size(newsize);
+				ret = btrfs_truncate_item(trans, root, path,
+							  newsize, 1);
+				BUG_ON(ret);
+			} else {
+				dec_i_blocks(inode, item_end + 1 -
+					     found_key.offset);
+			}
 		}
 delete:
 		if (del_item) {
@@ -1222,6 +1232,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
+	BTRFS_I(inode)->delalloc_bytes = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1528,6 +1539,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
 	if (mode & S_IFDIR)
@@ -1746,6 +1758,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2797,6 +2810,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 	stat->blksize = PAGE_CACHE_SIZE;
+	stat->blocks = inode->i_blocks + (BTRFS_I(inode)->delalloc_bytes >> 9);
 	return 0;
 }
 
@@ -2912,6 +2926,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
-- 
cgit v1.2.3


From 7e38326f5b125288a2baea1d815c500502ab9fc0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Handle checksumming errors while reading data blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   1 +
 fs/btrfs/extent_io.c   |   9 ++++
 fs/btrfs/extent_io.h   |   3 ++
 fs/btrfs/inode.c       | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 126 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9db9cbc019..fe6ef8e34166 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ struct btrfs_inode {
 	struct btrfs_key location;
 	struct extent_map_tree extent_tree;
 	struct extent_io_tree io_tree;
+	struct extent_io_tree io_failure_tree;
 	struct inode vfs_inode;
 
 	u64 ordered_trans;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2f159375c878..866460c3d72c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1534,6 +1534,15 @@ static int end_bio_extent_readpage(struct bio *bio,
 			if (ret)
 				uptodate = 0;
 		}
+		if (!uptodate && tree->ops &&
+		    tree->ops->readpage_io_failed_hook) {
+			ret = tree->ops->readpage_io_failed_hook(bio, page,
+							 start, end, state);
+			if (ret == 0) {
+				state = NULL;
+				continue;
+			}
+		}
 
 		spin_lock_irqsave(&tree->lock, flags);
 		if (!state || state->end != end) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8d6b8a14cc30..b47859ccd78a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,9 @@ struct extent_io_ops {
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
+				       u64 start, u64 end,
+				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8c2d5d036bd6..48f1d1b96450 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -385,6 +385,86 @@ out:
 	return ret;
 }
 
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	int last_mirror;
+};
+
+int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
+				  struct page *page, u64 start, u64 end,
+				  struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	u64 logical;
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		size_t pg_offset = start - page_offset(page);
+		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->last_mirror = 0;
+
+		em = btrfs_get_extent(inode, NULL, pg_offset, start,
+				      failrec->len, 0);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		failrec->logical = logical;
+		free_extent_map(em);
+		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
+				EXTENT_DIRTY, GFP_NOFS);
+		set_state_private(failure_tree, start, (u64)failrec);
+	} else {
+		failrec = (struct io_failure_record *)private;
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	failrec->last_mirror++;
+	if (!state) {
+		spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
+		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+						    failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
+	}
+	if (!state || failrec->last_mirror > num_copies) {
+		set_state_private(failure_tree, failrec->start, 0);
+		clear_extent_bits(failure_tree, failrec->start,
+				  failrec->start + failrec->len - 1,
+				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		kfree(failrec);
+		return -EIO;
+	}
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = failed_bio->bi_bdev;
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
+	return 0;
+}
+
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -419,6 +499,29 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
+
+	/* if the io failure tree for this inode is non-empty,
+	 * check to see if we've recovered from a failed IO
+	 */
+	private = 0;
+	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			     (u64)-1, 1, EXTENT_DIRTY)) {
+		u64 private_failure;
+		struct io_failure_record *failure;
+		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+					start, &private_failure);
+		if (ret == 0) {
+			failure = (struct io_failure_record *)private_failure;
+			set_state_private(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start, 0);
+			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start,
+					  failure->start + failure->len - 1,
+					  EXTENT_DIRTY | EXTENT_LOCKED,
+					  GFP_NOFS);
+			kfree(failure);
+		}
+	}
 	return 0;
 
 zeroit:
@@ -429,7 +532,7 @@ zeroit:
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
-	return 0;
+	return -EIO;
 }
 
 void btrfs_read_locked_inode(struct inode *inode)
@@ -1271,6 +1374,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+			     inode->i_mapping, GFP_NOFS);
 	return 0;
 }
 
@@ -1578,6 +1683,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+			     inode->i_mapping, GFP_NOFS);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1803,6 +1910,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
@@ -2972,6 +3081,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
@@ -3070,6 +3181,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
-- 
cgit v1.2.3


From 81d7ed29ff6bdec903c36c26b386e16c014993b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 08:51:48 -0400
Subject: Btrfs: Throttle file_write when data=ordered is flushing the inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |  1 +
 fs/btrfs/file.c         |  1 +
 fs/btrfs/inode.c        |  4 ++++
 fs/btrfs/ordered-data.c | 13 +++++++++++++
 fs/btrfs/ordered-data.h |  1 +
 fs/btrfs/transaction.c  | 10 ++++++++--
 6 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fe6ef8e34166..5ba83894c8b7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -31,6 +31,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
 	struct inode vfs_inode;
+	atomic_t ordered_writeback;
 
 	u64 ordered_trans;
 	/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5d537f26dc83..8effdf4f5d6f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -977,6 +977,7 @@ out_nolock:
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
+	btrfs_ordered_throttle(root, inode);
 	return num_written ? num_written : err;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d58b4d5a1fef..b31f52d4f2ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1419,6 +1419,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1728,6 +1729,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1956,6 +1958,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -3292,6 +3295,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 3ee51e10c187..b474902c90e2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -269,3 +269,16 @@ int btrfs_del_ordered_inode(struct inode *inode)
 	return ret;
 }
 
+int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_transaction *cur = root->fs_info->running_transaction;
+	while(cur == root->fs_info->running_transaction &&
+	      atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+		congestion_wait(WRITE, HZ/20);
+#else
+		blk_congestion_wait(WRITE, HZ/20);
+#endif
+	}
+	return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f25c6771ec64..29047e0abaab 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,4 +39,5 @@ int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
 int btrfs_del_ordered_inode(struct inode *inode);
+int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1ed179c020af..c85cb48d95ee 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -512,8 +512,11 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
 
-		if (S_ISREG(inode->i_mode))
+		if (S_ISREG(inode->i_mode)) {
+			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
 			filemap_fdatawrite(inode->i_mapping);
+			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
+		}
 		iput(inode);
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -530,8 +533,11 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
 
-		if (S_ISREG(inode->i_mode))
+		if (S_ISREG(inode->i_mode)) {
+			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
 			filemap_write_and_wait(inode->i_mapping);
+			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
+		}
 		atomic_dec(&inode->i_count);
 		iput(inode);
 
-- 
cgit v1.2.3


From 1b1e2135dc1e4efbcf25ac9ac9979316d4e1193e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Add a per-inode csum mutex to avoid races creating csum items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  | 1 +
 fs/btrfs/file.c         | 6 +++---
 fs/btrfs/inode.c        | 6 ++++++
 fs/btrfs/ordered-data.c | 5 +++--
 fs/btrfs/transaction.c  | 2 +-
 fs/btrfs/tree-defrag.c  | 7 +++++++
 6 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5ba83894c8b7..40b4e0c9cd09 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -30,6 +30,7 @@ struct btrfs_inode {
 	struct extent_map_tree extent_tree;
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
+	struct mutex csum_mutex;
 	struct inode vfs_inode;
 	atomic_t ordered_writeback;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ece221cba90c..8037792f8789 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -267,13 +267,13 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	/* FIXME...EIEIO, ENOSPC and more */
 	/* insert any holes we need to create */
-	if (isize < end_pos) {
+	if (isize < start_pos) {
 		u64 last_pos_in_file;
 		u64 hole_size;
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (end_pos - last_pos_in_file + mask) & ~mask;
-		if (last_pos_in_file < end_pos) {
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		if (last_pos_in_file < start_pos) {
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bbba3350d023..d39433dfb2c7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -351,7 +351,9 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
+	mutex_lock(&BTRFS_I(inode)->csum_mutex);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
+	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
@@ -1400,6 +1402,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
@@ -1701,6 +1704,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
@@ -1924,6 +1928,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
@@ -2862,6 +2867,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5e4c0d95ce43..254da8225664 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -152,12 +152,13 @@ int btrfs_add_ordered_inode(struct inode *inode)
 			   inode->i_ino, &entry->rb_node);
 
 	BTRFS_I(inode)->ordered_trans = transid;
+	if (!node)
+		igrab(inode);
 
 	write_unlock(&tree->lock);
+
 	if (node)
 		kfree(entry);
-	else
-		igrab(inode);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0c53ff775b92..8e909cb97c6d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -477,7 +477,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			if (err)
 				ret = err;
 			nr = trans->blocks_used;
-			ret = btrfs_end_transaction(trans, tree_root);
+			ret = btrfs_end_transaction_throttle(trans, tree_root);
 			BUG_ON(ret);
 
 			mutex_unlock(&root->fs_info->drop_mutex);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 1677e4edaf6f..b17693f61fbc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -38,8 +38,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	u64 last_ret = 0;
 
 	if (root->fs_info->extent_root == root) {
+		/*
+		 * there's recursion here right now in the tree locking,
+		 * we can't defrag the extent root without deadlock
+		 */
+		goto out;
+#if 0
 		mutex_lock(&root->fs_info->alloc_mutex);
 		is_extent = 1;
+#endif
 	}
 
 	if (root->ref_cows == 0 && !is_extent)
-- 
cgit v1.2.3


From e6dcd2dc9c489108648e2ed543315dd134d50a9a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:53:50 -0400
Subject: Btrfs: New data=ordered implementation

The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated an inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, The previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |   4 +-
 fs/btrfs/ctree.h        |  19 +-
 fs/btrfs/disk-io.c      |  13 +-
 fs/btrfs/extent-tree.c  | 132 +++++++++-----
 fs/btrfs/extent_io.c    |  52 +++++-
 fs/btrfs/extent_io.h    |  14 +-
 fs/btrfs/extent_map.c   |   5 +-
 fs/btrfs/file-item.c    |  62 ++++---
 fs/btrfs/file.c         |  67 ++++---
 fs/btrfs/inode.c        | 447 ++++++++++++++++++++++++++++++-----------------
 fs/btrfs/ordered-data.c | 455 ++++++++++++++++++++++++++++++------------------
 fs/btrfs/ordered-data.h |  71 ++++++--
 fs/btrfs/transaction.c  |  67 +------
 fs/btrfs/transaction.h  |   4 -
 14 files changed, 910 insertions(+), 502 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 40b4e0c9cd09..8d03687510e0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,6 +21,7 @@
 
 #include "extent_map.h"
 #include "extent_io.h"
+#include "ordered-data.h"
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -32,9 +33,8 @@ struct btrfs_inode {
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
 	struct inode vfs_inode;
-	atomic_t ordered_writeback;
+	struct btrfs_ordered_inode_tree ordered_tree;
 
-	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f3783dbd9b60..ceebc052ddcb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
+#include <linux/wait.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
 #include "extent_io.h"
@@ -37,6 +38,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
@@ -510,6 +512,7 @@ struct btrfs_fs_info {
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
+	wait_queue_head_t transaction_throttle;
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
@@ -541,6 +544,7 @@ struct btrfs_fs_info {
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
@@ -1384,6 +1388,17 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -1556,9 +1571,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums);
+			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret);
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b01b3f4f92a9..4a5ebafb935a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,11 @@ static int end_workqueue_bio(struct bio *bio,
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	if (bio->bi_rw & (1 << BIO_RW))
+		btrfs_queue_worker(&fs_info->endio_write_workers,
+				   &end_io_wq->work);
+	else
+		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -1286,6 +1290,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	init_waitqueue_head(&fs_info->transaction_throttle);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1325,9 +1330,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_write_workers,
+			   fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_write_workers,
+			    fs_info->thread_pool_size);
 
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
@@ -1447,6 +1456,7 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1702,6 +1712,7 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8ebfa6be0790..343d1101c31c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1895,36 +1895,17 @@ error:
 	return ret;
 }
 
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_alloc_size,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data)
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
 {
 	int ret;
-	int pending_ret;
-	u64 super_used;
-	u64 root_used;
 	u64 search_start = 0;
 	u64 alloc_profile;
-	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_root *extent_root = info->extent_root;
-	struct btrfs_extent_item *extent_item;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_path *path;
-	struct btrfs_key keys[2];
-
-	maybe_lock_mutex(root);
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -1974,11 +1955,48 @@ again:
 	}
 	if (ret) {
 		printk("allocation failed flags %Lu\n", data);
-	}
-	if (ret) {
 		BUG();
-		goto out;
 	}
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+	return 0;
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+				     empty_size, hint_byte, search_end, ins,
+				     data);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, u64 owner_offset,
+					 struct btrfs_key *ins)
+{
+	int ret;
+	int pending_ret;
+	u64 super_used;
+	u64 root_used;
+	u64 num_bytes = ins->offset;
+	u32 sizes[2];
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_path *path;
+	struct btrfs_key keys[2];
 
 	/* block accounting for super block */
 	spin_lock_irq(&info->delalloc_lock);
@@ -1990,10 +2008,6 @@ again:
 	root_used = btrfs_root_used(&root->root_item);
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
-
 	if (root == extent_root) {
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
@@ -2001,10 +2015,6 @@ again:
 		goto update_block;
 	}
 
-	WARN_ON(trans->alloc_exclude_nr);
-	trans->alloc_exclude_start = ins->objectid;
-	trans->alloc_exclude_nr = ins->offset;
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
 					 owner, owner_offset);
@@ -2054,6 +2064,51 @@ update_block:
 		BUG();
 	}
 out:
+	return ret;
+}
+
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 min_alloc_size,
+		       u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 owner_offset,
+		       u64 empty_size, u64 hint_byte,
+		       u64 search_end, struct btrfs_key *ins, u64 data)
+{
+	int ret;
+
+	maybe_lock_mutex(root);
+
+	ret = __btrfs_reserve_extent(trans, root, num_bytes,
+				     min_alloc_size, empty_size, hint_byte,
+				     search_end, ins, data);
+	BUG_ON(ret);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	BUG_ON(ret);
+
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2288,8 +2343,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
-			ret = drop_snap_lookup_refcount(root, bytenr,
-						blocksize, &refs);
+			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+						&refs);
 			BUG_ON(ret);
 			if (refs != 1) {
 				parent = path->nodes[*level];
@@ -2584,7 +2639,6 @@ out_unlock:
 	kfree(ra);
 	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 	if (trans) {
-		btrfs_add_ordered_inode(inode);
 		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40a5f53cb040..3f82a6e9ca4f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -793,6 +793,13 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_ordered);
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
@@ -812,8 +819,8 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-			      mask);
+			      EXTENT_DELALLOC | EXTENT_DIRTY,
+			      0, NULL, mask);
 }
 EXPORT_SYMBOL(set_extent_delalloc);
 
@@ -825,6 +832,13 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(clear_extent_dirty);
 
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_ordered);
+
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
@@ -1395,10 +1409,9 @@ static int end_bio_extent_writepage(struct bio *bio,
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
-
 		if (tree->ops && tree->ops->writepage_end_io_hook) {
 			ret = tree->ops->writepage_end_io_hook(page, start,
-						       end, state);
+						       end, state, uptodate);
 			if (ret)
 				uptodate = 0;
 		}
@@ -1868,9 +1881,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			unlock_extent(tree, cur, end, GFP_NOFS);
 			break;
 		}
-
 		extent_offset = cur - em->start;
+		if (extent_map_end(em) <= cur) {
+printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
+		}
 		BUG_ON(extent_map_end(em) <= cur);
+		if (end < cur) {
+printk("2bad mapping end %Lu cur %Lu\n", end, cur);
+		}
 		BUG_ON(end < cur);
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
@@ -1976,6 +1994,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
+	u64 unlock_start;
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
@@ -1988,7 +2007,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 nr_delalloc;
 	u64 delalloc_end;
 
-
 	WARN_ON(!PageLocked(page));
 	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2030,6 +2048,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		delalloc_start = delalloc_end + 1;
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_start = start;
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
@@ -2038,6 +2057,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_extent(tree, start, page_end, GFP_NOFS);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		unlock_start = page_end + 1;
 		goto done;
 	}
 
@@ -2047,6 +2071,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 page_end, NULL, 1);
+			unlock_start = page_end + 1;
 			break;
 		}
 		em = epd->get_extent(inode, page, page_offset, cur,
@@ -2071,8 +2100,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
+
+			unlock_extent(tree, unlock_start, cur + iosize -1,
+				      GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 cur + iosize - 1,
+							 NULL, 1);
 			cur = cur + iosize;
 			page_offset += iosize;
+			unlock_start = cur;
 			continue;
 		}
 
@@ -2119,7 +2156,8 @@ done:
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	unlock_extent(tree, start, page_end, GFP_NOFS);
+	if (unlock_start <= page_end)
+		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f1960dafaa19..2268a7995896 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,6 +13,8 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_ORDERED (1 << 9)
+#define EXTENT_ORDERED_METADATA (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
@@ -42,7 +44,7 @@ struct extent_io_ops {
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
-				      struct extent_state *state);
+				      struct extent_state *state, int uptodate);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
@@ -131,6 +133,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int filled);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -141,8 +145,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -209,6 +219,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len);
 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
 				    struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f5a04eb9a2ac..81123277c2b8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -206,10 +206,11 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
-		merge = rb_entry(rb, struct extent_map, rb_node);
 		ret = -EEXIST;
+		free_extent_map(merge);
 		goto out;
 	}
 	atomic_inc(&em->refs);
@@ -268,6 +269,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
+	BUG_ON(spin_trylock(&tree->lock));
 	em = tree->last;
 	if (em && end > em->start && start < extent_map_end(em))
 		goto found;
@@ -318,6 +320,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
 	int ret = 0;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	if (tree->last == em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f537eb43c2c6..345caf8ff516 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -135,26 +135,37 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret)
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret)
 {
-	u32 *sums;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
 
-	sums = kmalloc(bio->bi_vcnt * BTRFS_CRC32_SIZE, GFP_NOFS);
+	WARN_ON(bio->bi_vcnt <= 0);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
-	*sums_ret = (char *)sums;
+	*sums_ret = sums;
+	sector_sum = &sums->sums;
+	sums->file_offset = page_offset(bvec->bv_page);
+	sums->len = bio->bi_size;
+	INIT_LIST_HEAD(&sums->list);
 
 	while(bio_index < bio->bi_vcnt) {
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
-		*sums = ~(u32)0;
-		*sums = btrfs_csum_data(root, data + bvec->bv_offset,
-					*sums, bvec->bv_len);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root,
+						  data + bvec->bv_offset,
+						  sector_sum->sum,
+						  bvec->bv_len);
 		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(*sums, (char *)sums);
-		sums++;
+		btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(bvec->bv_page) +
+			bvec->bv_offset;
+		sector_sum++;
 		bio_index++;
 		bvec++;
 	}
@@ -163,7 +174,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root,
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums)
+			   struct btrfs_ordered_sum *sums)
 {
 	u64 objectid = inode->i_ino;
 	u64 offset;
@@ -171,17 +182,16 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	u64 next_offset;
+	u64 total_bytes = 0;
 	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
-	u32 *sums32 = (u32 *)sums;
+	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	int bio_index = 0;
-	struct bio_vec *bvec = bio->bi_io_vec;
 	char *eb_map;
 	char *eb_token;
 	unsigned long map_len;
@@ -189,10 +199,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	sector_sum = &sums->sums;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	offset = sector_sum->offset;
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
@@ -303,7 +314,7 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-next_bvec:
+next_sector:
 
 	if (!eb_token ||
 	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
@@ -321,21 +332,20 @@ next_bvec:
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       sums32, BTRFS_CRC32_SIZE);
+		       &sector_sum->sum, BTRFS_CRC32_SIZE);
 	} else {
-		write_extent_buffer(leaf, sums32, (unsigned long)item,
-				    BTRFS_CRC32_SIZE);
+		write_extent_buffer(leaf, &sector_sum->sum,
+				    (unsigned long)item, BTRFS_CRC32_SIZE);
 	}
-	bio_index++;
-	bvec++;
-	sums32++;
-	if (bio_index < bio->bi_vcnt) {
+	total_bytes += root->sectorsize;
+	sector_sum++;
+	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  BTRFS_CRC32_SIZE);
 		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    page_offset(bvec->bv_page)) {
-			offset = page_offset(bvec->bv_page);
-			goto next_bvec;
+		    sector_sum->offset) {
+			    offset = sector_sum->offset;
+			goto next_sector;
 		}
 	}
 	if (eb_token) {
@@ -343,7 +353,7 @@ next_bvec:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	if (bio_index < bio->bi_vcnt) {
+	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
 		goto again;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8037792f8789..12e765f7e0d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,7 +34,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 #include "compat.h"
@@ -273,7 +272,9 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		if (last_pos_in_file < start_pos) {
+		if (hole_size > 0) {
+			btrfs_wait_ordered_range(inode, last_pos_in_file,
+						 last_pos_in_file + hole_size);
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
@@ -303,19 +304,17 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	    inline_size > root->fs_info->max_inline ||
 	    (inline_size & (root->sectorsize -1)) == 0 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		u64 last_end;
-
+		/* check for reserved extents on each page, we don't want
+		 * to reset the delalloc bit on things that already have
+		 * extents reserved.
+		 */
+		set_extent_delalloc(io_tree, start_pos,
+				    end_of_last_block, GFP_NOFS);
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
 			set_page_dirty(p);
 		}
-		last_end = (u64)(pages[num_pages -1]->index) <<
-				PAGE_CACHE_SHIFT;
-		last_end += PAGE_CACHE_SIZE - 1;
-		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
-				 GFP_NOFS);
-		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -350,10 +349,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *tmp;
 	u64 len = end - start + 1;
+	u64 next_start;
 	int ret;
 	int testend = 1;
 
+	WARN_ON(end < start);
 	if (end == (u64)-1) {
 		len = (u64)-1;
 		testend = 0;
@@ -370,6 +372,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			spin_unlock(&em_tree->lock);
 			break;
 		}
+		tmp = rb_entry(&em->rb_node, struct extent_map, rb_node);
+		next_start = tmp->start;
 		remove_extent_mapping(em_tree, em);
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
@@ -778,37 +782,58 @@ static int prepare_pages(struct btrfs_root *root, struct file *file,
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
 	u64 start_pos;
+	u64 last_pos;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
+	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
-
+again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
 			err = -ENOMEM;
 			BUG_ON(1);
 		}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(pages[i]);
-#else
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(pages[i]);
-		set_page_extent_mapped(pages[i]);
-		WARN_ON(!PageLocked(pages[i]));
 	}
 	if (start_pos < inode->i_size) {
-		u64 last_pos;
-		last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > start_pos &&
+		    ordered->file_offset < last_pos) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent(&BTRFS_I(inode)->io_tree,
+				      start_pos, last_pos - 1, GFP_NOFS);
+			for (i = 0; i < num_pages; i++) {
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
+			}
+			btrfs_wait_ordered_range(inode, start_pos,
+						 last_pos - start_pos);
+			goto again;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
 				  GFP_NOFS);
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      start_pos, last_pos - 1, GFP_NOFS);
 	}
+	for (i = 0; i < num_pages; i++) {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(pages[i]);
+#else
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+#endif
+		set_page_extent_mapped(pages[i]);
+		WARN_ON(!PageLocked(pages[i]));
+	}
 	return 0;
 }
 
@@ -969,13 +994,11 @@ out_nolock:
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
-	btrfs_ordered_throttle(root, inode);
 	return num_written ? num_written : err;
 }
 
 int btrfs_release_file(struct inode * inode, struct file * filp)
 {
-	btrfs_del_ordered_inode(inode, 0);
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d39433dfb2c7..c5a62f0b9595 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "ordered-data.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -109,10 +110,11 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_start = start;
 	u64 orig_num_bytes;
 	struct btrfs_key ins;
-	int ret;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -120,33 +122,44 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_bytes, start, &alloc_hint);
 	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
 
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
-		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
-					 root->sectorsize,
-					 root->root_key.objectid,
-					 trans->transid,
-					 inode->i_ino, start, 0,
-					 alloc_hint, (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+					   root->sectorsize, 0, 0,
+					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
 			goto out;
 		}
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = start;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ins.offset - 1);
+		}
+
 		cur_alloc_size = ins.offset;
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       start, ins.objectid, ins.offset,
-					       ins.offset, 0);
-		inode->i_blocks += ins.offset >> 9;
-		btrfs_check_file(root, inode);
+		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+					       ins.offset);
+		BUG_ON(ret);
 		if (num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
 			       cur_alloc_size);
@@ -156,10 +169,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	btrfs_drop_extent_cache(inode, orig_start,
-				orig_start + orig_num_bytes - 1);
-	btrfs_add_ordered_inode(inode);
-	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -341,25 +350,15 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	int ret = 0;
-	char *sums = NULL;
+	struct btrfs_ordered_sum *sums;
 
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	btrfs_set_trans_block_group(trans, inode);
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_add_ordered_sum(inode, sums);
 	BUG_ON(ret);
 
-	kfree(sums);
-
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
@@ -369,14 +368,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	if (!(rw & (1 << BIO_RW))) {
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-		BUG_ON(ret);
-		goto mapit;
-	}
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM)) {
+	if (!(rw & (1 << BIO_RW))) {
 		goto mapit;
 	}
 
@@ -387,6 +382,96 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+			     struct inode *inode, u64 file_offset,
+			     struct list_head *list)
+{
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	btrfs_set_trans_block_group(trans, inode);
+	while(!list_empty(list)) {
+		cur = list->next;
+		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+		mutex_lock(&BTRFS_I(inode)->csum_mutex);
+		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
+				       inode, sum);
+		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return 0;
+}
+
+int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered_extent;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 alloc_hint = 0;
+	struct list_head list;
+	struct btrfs_key ins;
+	int ret;
+
+	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	if (!ret) {
+		return 0;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	BUG_ON(!ordered_extent);
+
+	lock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+
+	INIT_LIST_HEAD(&list);
+
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode,
+				 ordered_extent->file_offset,
+				 ordered_extent->file_offset +
+				 ordered_extent->len,
+				 ordered_extent->file_offset, &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       ordered_extent->file_offset,
+				       ordered_extent->start,
+				       ordered_extent->len,
+				       ordered_extent->len, 0);
+	BUG_ON(ret);
+	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
+				ordered_extent->file_offset +
+				ordered_extent->len - 1);
+	inode->i_blocks += ordered_extent->len >> 9;
+	unlock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+	add_pending_csums(trans, inode, ordered_extent->file_offset,
+			  &ordered_extent->list);
+
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+	/* once for us */
+	btrfs_put_ordered_extent(ordered_extent);
+	/* once for the tree */
+	btrfs_put_ordered_extent(ordered_extent);
+
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -409,7 +494,8 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
 		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino, start);
+		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+		       start);
 		goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -833,7 +919,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
-	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -849,14 +934,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
 
-	if (inode->i_nlink == 0) {
-		/* if the inode isn't linked anywhere,
-		 * we don't need to worry about
-		 * data=ordered
-		 */
-		btrfs_del_ordered_inode(inode, 1);
-	}
-
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
@@ -931,6 +1008,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1117,34 +1195,6 @@ error:
 	return ret;
 }
 
-static int btrfs_cow_one_page(struct inode *inode, struct page *page,
-			      size_t zero_start)
-{
-	char *kaddr;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	int ret = 0;
-
-	WARN_ON(!PageLocked(page));
-	set_page_extent_mapped(page);
-
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
-
-	if (zero_start != PAGE_CACHE_SIZE) {
-		kaddr = kmap(page);
-		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
-		flush_dcache_page(page);
-		kunmap(page);
-	}
-	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-	return ret;
-}
-
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -1153,12 +1203,16 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
 	int ret = 0;
 	u64 page_start;
+	u64 page_end;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -1168,6 +1222,10 @@ again:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -1181,10 +1239,32 @@ again:
 			goto out;
 		}
 	}
-
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	wait_on_page_writeback(page);
-	ret = btrfs_cow_one_page(inode, page, offset);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
+	if (offset != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	unlock_page(page);
 	page_cache_release(page);
@@ -1222,8 +1302,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
+		btrfs_wait_ordered_range(inode, hole_start, hole_size);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
@@ -1258,6 +1339,7 @@ void btrfs_delete_inode(struct inode *inode)
 	unsigned long nr;
 	int ret;
 
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
@@ -1403,7 +1485,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1705,7 +1786,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1930,7 +2010,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2066,64 +2145,18 @@ out_unlock:
 
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
-				struct extent_map *em)
+				struct extent_map *em,
+				u64 map_start, u64 map_len)
 {
 	u64 start_diff;
-	u64 new_end;
-	int ret = 0;
-	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
-
-	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
-		goto invalid;
-
-	if (!real_blocks && em->block_start != existing->block_start)
-		goto invalid;
-
-	new_end = max(existing->start + existing->len, em->start + em->len);
-
-	if (existing->start >= em->start) {
-		if (em->start + em->len < existing->start)
-			goto invalid;
 
-		start_diff = existing->start - em->start;
-		if (real_blocks && em->block_start + start_diff !=
-		    existing->block_start)
-			goto invalid;
-
-		em->len = new_end - em->start;
-
-		remove_extent_mapping(em_tree, existing);
-		/* free for the tree */
-		free_extent_map(existing);
-		ret = add_extent_mapping(em_tree, em);
-
-	} else if (em->start > existing->start) {
-
-		if (existing->start + existing->len < em->start)
-			goto invalid;
-
-		start_diff = em->start - existing->start;
-		if (real_blocks && existing->block_start + start_diff !=
-		    em->block_start)
-			goto invalid;
-
-		remove_extent_mapping(em_tree, existing);
-		em->block_start = existing->block_start;
-		em->start = existing->start;
-		em->len = new_end - existing->start;
-		free_extent_map(existing);
-
-		ret = add_extent_mapping(em_tree, em);
-	} else {
-		goto invalid;
-	}
-	return ret;
-
-invalid:
-	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
-	       existing->start, existing->len, existing->block_start,
-	       em->start, em->len, em->block_start);
-	return -EIO;
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+	start_diff = map_start - em->start;
+	em->start = map_start;
+	em->len = map_len;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+		em->block_start += start_diff;
+	return add_extent_mapping(em_tree, em);
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2170,10 +2203,9 @@ again:
 		err = -ENOMEM;
 		goto out;
 	}
-
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -2314,6 +2346,9 @@ insert:
 	 */
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
+
+		ret = 0;
+
 		existing = lookup_extent_mapping(em_tree, start, len);
 		if (existing && (existing->start > start ||
 		    existing->start + existing->len <= start)) {
@@ -2325,7 +2360,8 @@ insert:
 							 em->len);
 			if (existing) {
 				err = merge_extent_mapping(em_tree, existing,
-							   em);
+							   em, start,
+							   root->sectorsize);
 				free_extent_map(existing);
 				if (err) {
 					free_extent_map(em);
@@ -2341,6 +2377,7 @@ insert:
 		} else {
 			free_extent_map(em);
 			em = existing;
+			err = 0;
 		}
 	}
 	spin_unlock(&em_tree->lock);
@@ -2348,8 +2385,9 @@ out:
 	btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err)
+		if (!err) {
 			err = ret;
+		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -2474,8 +2512,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
-
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -2493,15 +2530,54 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return ret;
 }
 
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					      page_offset(page));
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		return 0;
+	}
+	return __btrfs_releasepage(page, gfp_flags);
+}
+
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	wait_on_page_writeback(page);
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btrfs_releasepage(page, GFP_NOFS);
+	if (offset) {
+		btrfs_releasepage(page, GFP_NOFS);
+		return;
+	}
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					   page_offset(page));
+	if (ordered) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+		btrfs_writepage_end_io_hook(page, page_start,
+					    page_end, NULL, 1);
+		btrfs_put_ordered_extent(ordered);
+		lock_extent(tree, page_start, page_end, GFP_NOFS);
+	}
+	clear_extent_bit(tree, page_start, page_end,
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+		 EXTENT_ORDERED,
+		 1, 1, GFP_NOFS);
+	__btrfs_releasepage(page, GFP_NOFS);
+
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+		invalidate_extent_lru(tree, page_offset(page),
+				      PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2527,35 +2603,63 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long end;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
+	unsigned long zero_start;
 	loff_t size;
 	int ret;
 	u64 page_start;
+	u64 page_end;
 
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
 	if (ret)
 		goto out;
 
 	ret = -EINVAL;
-
+again:
 	lock_page(page);
-	wait_on_page_writeback(page);
 	size = i_size_read(inode);
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if ((page->mapping != inode->i_mapping) ||
-	    (page_start > size)) {
+	    (page_start >= size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
+	wait_on_page_writeback(page);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
 
 	/* page is wholly or partially inside EOF */
 	if (page_start + PAGE_CACHE_SIZE > size)
-		end = size & ~PAGE_CACHE_MASK;
+		zero_start = size & ~PAGE_CACHE_MASK;
 	else
-		end = PAGE_CACHE_SIZE;
+		zero_start = PAGE_CACHE_SIZE;
 
-	ret = btrfs_cow_one_page(inode, page, end);
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
 	unlock_page(page);
@@ -2662,15 +2766,28 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
-	ei->ordered_trans = 0;
+	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
+	struct btrfs_ordered_extent *ordered;
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (!ordered)
+			break;
+		else {
+			printk("found ordered extent %Lu %Lu\n",
+			       ordered->file_offset, ordered->len);
+			btrfs_remove_ordered_extent(inode, ordered);
+			btrfs_put_ordered_extent(ordered);
+			btrfs_put_ordered_extent(ordered);
+		}
+	}
 	btrfs_drop_extent_cache(inode, 0, (u64)-1);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
@@ -2869,7 +2986,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2921,6 +3037,20 @@ out_fail:
 	return err;
 }
 
+static int btrfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			    EXTENT_DELALLOC, 0)) {
+printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
+WARN_ON(1);
+	}
+	return __set_page_dirty_nobuffers(page);
+}
+
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
 {
@@ -2967,6 +3097,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
@@ -2982,7 +3113,7 @@ static struct address_space_operations btrfs_aops = {
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.set_page_dirty	= btrfs_set_page_dirty,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 254da8225664..6513270f054c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -22,48 +22,30 @@
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "extent_io.h"
 
-struct tree_entry {
-	u64 root_objectid;
-	u64 objectid;
-	struct inode *inode;
-	struct rb_node rb_node;
-};
 
-/*
- * returns > 0 if entry passed (root, objectid) is > entry,
- * < 0 if (root, objectid) < entry and zero if they are equal
- */
-static int comp_entry(struct tree_entry *entry, u64 root_objectid,
-		      u64 objectid)
+static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
-	if (root_objectid < entry->root_objectid)
-		return -1;
-	if (root_objectid > entry->root_objectid)
-		return 1;
-	if (objectid < entry->objectid)
-		return -1;
-	if (objectid > entry->objectid)
-		return 1;
-	return 0;
+	if (entry->file_offset + entry->len < entry->file_offset)
+		return (u64)-1;
+	return entry->file_offset + entry->len;
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
-				   u64 objectid, struct rb_node *node)
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
-	struct tree_entry *entry;
-	int comp;
+	struct btrfs_ordered_extent *entry;
 
 	while(*p) {
 		parent = *p;
-		entry = rb_entry(parent, struct tree_entry, rb_node);
+		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
-		comp = comp_entry(entry, root_objectid, objectid);
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			p = &(*p)->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			p = &(*p)->rb_right;
 		else
 			return parent;
@@ -74,24 +56,23 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
 	return NULL;
 }
 
-static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
-				     u64 objectid, struct rb_node **prev_ret)
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+				     struct rb_node **prev_ret)
 {
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
-	struct tree_entry *entry;
-	struct tree_entry *prev_entry = NULL;
-	int comp;
+	struct rb_node *test;
+	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *prev_entry = NULL;
 
 	while(n) {
-		entry = rb_entry(n, struct tree_entry, rb_node);
+		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
-		comp = comp_entry(entry, root_objectid, objectid);
 
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			n = n->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			n = n->rb_right;
 		else
 			return n;
@@ -99,195 +80,329 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
-		prev = rb_next(prev);
-		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	while(prev && file_offset >= entry_end(prev_entry)) {
+		test = rb_next(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		if (file_offset < entry_end(prev_entry))
+			break;
+
+		prev = test;
+	}
+	if (prev)
+		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+				      rb_node);
+	while(prev && file_offset < entry_end(prev_entry)) {
+		test = rb_prev(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		prev = test;
 	}
 	*prev_ret = prev;
 	return NULL;
 }
 
-static inline struct rb_node *tree_search(struct rb_root *root,
-					  u64 root_objectid, u64 objectid)
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+	if (file_offset < entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+					  u64 file_offset)
 {
+	struct rb_root *root = &tree->tree;
 	struct rb_node *prev;
 	struct rb_node *ret;
-	ret = __tree_search(root, root_objectid, objectid, &prev);
+	struct btrfs_ordered_extent *entry;
+
+	if (tree->last) {
+		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+				 rb_node);
+		if (offset_in_entry(entry, file_offset))
+			return tree->last;
+	}
+	ret = __tree_search(root, file_offset, &prev);
 	if (!ret)
-		return prev;
+		ret = prev;
+	if (ret)
+		tree->last = ret;
 	return ret;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode)
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
-	u64 transid = root->fs_info->running_transaction->transid;
-	struct tree_entry *entry;
-	struct rb_node *node;
 	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	if (transid <= BTRFS_I(inode)->ordered_trans)
-		return 0;
-
-	tree = &root->fs_info->running_transaction->ordered_inode_tree;
-
-	read_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
-	read_unlock(&tree->lock);
-	if (node) {
-		return 0;
-	}
-
-	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	entry = kzalloc(sizeof(*entry), GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
-	write_lock(&tree->lock);
-	entry->objectid = inode->i_ino;
-	entry->root_objectid = root_objectid;
+	mutex_lock(&tree->mutex);
+	entry->file_offset = file_offset;
+	entry->start = start;
+	entry->len = len;
 	entry->inode = inode;
+	/* one ref for the tree */
+	atomic_set(&entry->refs, 1);
+	init_waitqueue_head(&entry->wait);
+	INIT_LIST_HEAD(&entry->list);
 
-	node = tree_insert(&tree->tree, root_objectid,
-			   inode->i_ino, &entry->rb_node);
-
-	BTRFS_I(inode)->ordered_trans = transid;
-	if (!node)
-		igrab(inode);
-
-	write_unlock(&tree->lock);
+	node = tree_insert(&tree->tree, file_offset,
+			   &entry->rb_node);
+	if (node) {
+		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		atomic_inc(&entry->refs);
+	}
+	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+			   entry_end(entry) - 1, GFP_NOFS);
 
-	if (node)
-		kfree(entry);
+	set_bit(BTRFS_ORDERED_START, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	BUG_ON(node);
 	return 0;
 }
 
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				   u64 *root_objectid, u64 *objectid,
-				   struct inode **inode)
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, sum->file_offset);
 	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+search_fail:
+printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
+		node = rb_first(&tree->tree);
+		while(node) {
+			entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+			printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
+			node = rb_next(node);
+		}
+		BUG();
 	}
-	entry = rb_entry(node, struct tree_entry, rb_node);
+	BUG_ON(!node);
 
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
-	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, sum->file_offset)) {
+		goto search_fail;
 	}
 
-	*root_objectid = entry->root_objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
-	*objectid = entry->objectid;
-	write_unlock(&tree->lock);
-	return 1;
+	list_add_tail(&sum->list, &entry->list);
+	mutex_unlock(&tree->mutex);
+	return 0;
 }
 
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode)
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   u64 file_offset, u64 io_size)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	struct btrfs_ordered_extent *entry;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int ret;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+			     GFP_NOFS);
+	node = tree_search(tree, file_offset);
 	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+		ret = 1;
+		goto out;
 	}
 
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset)) {
+		ret = 1;
+		goto out;
 	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+
+	ret = test_range_bit(io_tree, entry->file_offset,
+			     entry->file_offset + entry->len - 1,
+			     EXTENT_ORDERED, 0);
+	if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
+printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
 	}
+	if (ret == 0)
+		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+out:
+	mutex_unlock(&tree->mutex);
+	return ret == 0;
+}
 
-	*root_objectid = entry->root_objectid;
-	*objectid = entry->objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
-	rb_erase(node, &tree->tree);
-	write_unlock(&tree->lock);
-	kfree(entry);
-	return 1;
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+	if (atomic_dec_and_test(&entry->refs))
+		kfree(entry);
+	return 0;
 }
 
-static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				     struct inode *inode,
-				     u64 root_objectid, u64 objectid)
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-	struct rb_node *prev;
 
-	write_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
-	if (!node) {
-		write_unlock(&tree->lock);
-		return;
-	}
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
-	BTRFS_I(inode)->ordered_trans = 0;
-	write_unlock(&tree->lock);
-	atomic_dec(&inode->i_count);
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	kfree(entry);
-	return;
+	tree->last = NULL;
+	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	wake_up(&entry->wait);
+	return 0;
 }
 
-void btrfs_del_ordered_inode(struct inode *inode, int force)
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	wait_event(entry->wait,
+		   test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
+}
 
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
+static void btrfs_start_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry, int wait)
+{
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
 
-	if (!force && (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
-		return;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	if (wait)
+		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						 &entry->flags));
+}
 
-	spin_lock(&root->fs_info->new_trans_lock);
-	if (root->fs_info->running_transaction) {
-		struct btrfs_ordered_inode_tree *tree;
-		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
-						inode->i_ino);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+	u64 end;
+	struct btrfs_ordered_extent *ordered;
+	int found;
+	int should_wait = 0;
+
+again:
+	if (start + len < start)
+		end = (u64)-1;
+	else
+		end = start + len - 1;
+	found = 0;
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, end);
+		if (!ordered) {
+			break;
+		}
+		if (ordered->file_offset >= start + len) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered->file_offset + ordered->len < start) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		btrfs_start_ordered_extent(inode, ordered, should_wait);
+		found++;
+		end = ordered->file_offset;
+		btrfs_put_ordered_extent(ordered);
+		if (end == 0)
+			break;
+		end--;
+	}
+	if (should_wait && found) {
+		should_wait = 0;
+		goto again;
 	}
-	spin_unlock(&root->fs_info->new_trans_lock);
 }
 
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len)
 {
-	struct btrfs_transaction *cur = root->fs_info->running_transaction;
-	while(cur == root->fs_info->running_transaction &&
-	      atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
-		congestion_wait(WRITE, HZ/20);
-#else
-		blk_congestion_wait(WRITE, HZ/20);
-#endif
-	}
+	WARN_ON(1);
 	return 0;
+#if 0
+	int ret;
+	struct btrfs_ordered_inode_tree *tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
+	ret = 0;
+out:
+	mutex_unlock(&tree->mutex);
+	return ret;
+#endif
+}
+
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset))
+		entry = NULL;
+	if (entry)
+		atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
+}
+
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4fa78736423e..33292c5fe90c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -20,24 +20,73 @@
 #define __BTRFS_ORDERED_DATA__
 
 struct btrfs_ordered_inode_tree {
-	rwlock_t lock;
+	struct mutex mutex;
 	struct rb_root tree;
+	struct rb_node *last;
 };
 
+struct btrfs_sector_sum {
+	u64 offset;
+	u32 sum;
+};
+
+struct btrfs_ordered_sum {
+	u64 file_offset;
+	u64 len;
+	struct list_head list;
+	struct btrfs_sector_sum sums;
+};
+
+/* bits for the flags field */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+#define BTRFS_ORDERED_START 2 /* set when tree setup */
+
+struct btrfs_ordered_extent {
+	u64 file_offset;
+	u64 start;
+	u64 len;
+	unsigned long flags;
+	atomic_t refs;
+	struct list_head list;
+	struct inode *inode;
+	wait_queue_head_t wait;
+	struct rb_node rb_node;
+};
+
+
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+{
+	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+		root->sectorsize;
+	return sizeof(struct btrfs_ordered_sum) +
+		num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	rwlock_init(&t->lock);
+	mutex_init(&t->mutex);
 	t->tree.rb_node = NULL;
+	t->last = NULL;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode);
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode, int force);
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				       u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len);
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset);
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a8a3cb03de59..86a5acc19ce7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,7 +67,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 		cur_trans->start_time = get_seconds();
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -158,10 +157,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		wake_up(&cur_trans->writer_wait);
 
 	if (cur_trans->in_commit && throttle) {
-		int ret;
+		DEFINE_WAIT(wait);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		ret = wait_for_commit(root, cur_trans);
-		BUG_ON(ret);
+		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
+				TASK_UNINTERRUPTIBLE);
+		schedule();
+		finish_wait(&root->fs_info->transaction_throttle, &wait);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
 
@@ -486,58 +487,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
-{
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct inode *inode;
-	u64 root_objectid = 0;
-	u64 objectid = 0;
-	int ret;
-
-	atomic_inc(&root->fs_info->throttles);
-	while(1) {
-		ret = btrfs_find_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_fdatawrite(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	while(1) {
-		root_objectid = 0;
-		objectid = 0;
-		ret = btrfs_find_del_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_write_and_wait(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		atomic_dec(&inode->i_count);
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	atomic_dec(&root->fs_info->throttles);
-	return 0;
-}
-
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -666,6 +615,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
+printk("commit trans %Lu\n", trans->transid);
 	trans->transaction->in_commit = 1;
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -699,8 +649,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
-		ret = btrfs_write_ordered_inodes(trans, root);
-
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
@@ -736,6 +684,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_copy_pinned(root, pinned_copy);
 
+	wake_up(&root->fs_info->transaction_throttle);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
@@ -758,6 +708,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
+printk("done commit trans %Lu\n", trans->transid);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9ccd5a5b170f..910350cd4cf0 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,7 +19,6 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -31,7 +30,6 @@ struct btrfs_transaction {
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
-	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
@@ -88,8 +86,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From dbe674a99c8af088faa4c95eddaeb271a3140ab6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:05 -0400
Subject: Btrfs: Update on disk i_size only after pending ordered extents are
 done

This changes the ordered data code to update i_size after the extent
is on disk.  An on disk i_size is maintained in the in-memory btrfs inode
structures, and this is updated as extents finish.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |  9 +++++
 fs/btrfs/file.c         |  2 +-
 fs/btrfs/inode.c        | 28 ++++++++++------
 fs/btrfs/ordered-data.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |  2 ++
 5 files changed, 119 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8d03687510e0..81c0444f37b6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,11 +40,20 @@ struct btrfs_inode {
 	 */
 	u64 last_trans;
 	u64 delalloc_bytes;
+	u64 disk_i_size;
 	u32 flags;
 };
+
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
 }
 
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+	inode->i_size = size;
+	BTRFS_I(inode)->disk_i_size = size;
+}
+
+
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 20928639d173..3e4e5c227c0c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -338,7 +338,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
-	err = btrfs_end_transaction(trans, root);
+	err = btrfs_end_transaction_throttle(trans, root);
 out_unlock:
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 47a008c19308..baf46017d0d3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -542,6 +542,7 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
+	btrfs_ordered_update_i_size(inode, ordered_extent);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
@@ -792,7 +793,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
 	inode->i_uid = btrfs_inode_uid(leaf, inode_item);
 	inode->i_gid = btrfs_inode_gid(leaf, inode_item);
-	inode->i_size = btrfs_inode_size(leaf, inode_item);
+	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
 
 	tspec = btrfs_inode_atime(inode_item);
 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
@@ -860,7 +861,7 @@ static void fill_inode_item(struct extent_buffer *leaf,
 {
 	btrfs_set_inode_uid(leaf, item, inode->i_uid);
 	btrfs_set_inode_gid(leaf, item, inode->i_gid);
-	btrfs_set_inode_size(leaf, item, inode->i_size);
+	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
 	btrfs_set_inode_mode(leaf, item, inode->i_mode);
 	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
 
@@ -982,7 +983,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 err:
 	btrfs_free_path(path);
 	if (!ret) {
-		dir->i_size -= name_len * 2;
+		btrfs_i_size_write(dir, dir->i_size - name_len * 2);
 		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 		btrfs_update_inode(trans, root, dir);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
@@ -1044,7 +1045,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
 	if (!err) {
-		inode->i_size = 0;
+		btrfs_i_size_write(inode, 0);
 	}
 
 	nr = trans->blocks_used;
@@ -1089,7 +1090,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
-	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1427,7 +1427,7 @@ void btrfs_delete_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -1561,6 +1561,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	BTRFS_I(inode)->disk_i_size = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1869,6 +1870,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
 
 	if (mode & S_IFDIR)
@@ -1964,7 +1966,8 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 					     dentry->d_parent->d_inode->i_ino);
 		}
 		parent_inode = dentry->d_parent->d_inode;
-		parent_inode->i_size += dentry->d_name.len * 2;
+		btrfs_i_size_write(parent_inode, parent_inode->i_size +
+				   dentry->d_name.len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
@@ -2092,6 +2095,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2199,7 +2203,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode->i_fop = &btrfs_dir_file_operations;
 	btrfs_set_trans_block_group(trans, inode);
 
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		goto out_fail;
@@ -2756,6 +2760,7 @@ static void btrfs_truncate(struct inode *inode)
 	int ret;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
+	u64 mask = root->sectorsize - 1;
 
 	if (!S_ISREG(inode->i_mode))
 		return;
@@ -2766,6 +2771,8 @@ static void btrfs_truncate(struct inode *inode)
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+	btrfs_i_size_write(inode, inode->i_size);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode,
@@ -2821,7 +2828,7 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	ret = btrfs_insert_inode_ref(trans, new_root, "..", 2, new_dirid,
 				     new_dirid);
 	inode->i_nlink = 1;
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 
 	return btrfs_update_inode(trans, new_root, inode);
 }
@@ -3069,6 +3076,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -3103,7 +3111,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-	inode->i_size = name_len - 1;
+	btrfs_i_size_write(inode, name_len - 1);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		drop_inode = 1;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6513270f054c..d86a953ae51d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -406,3 +406,92 @@ out:
 	mutex_unlock(&tree->mutex);
 	return entry;
 }
+
+int btrfs_ordered_update_i_size(struct inode *inode,
+				struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 disk_i_size;
+	u64 new_i_size;
+	u64 i_size_test;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *test;
+
+	mutex_lock(&tree->mutex);
+	disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+	/*
+	 * if the disk i_size is already at the inode->i_size, or
+	 * this ordered extent is inside the disk i_size, we're done
+	 */
+	if (disk_i_size >= inode->i_size ||
+	    ordered->file_offset + ordered->len <= disk_i_size) {
+		goto out;
+	}
+
+	/*
+	 * we can't update the disk_isize if there are delalloc bytes
+	 * between disk_i_size and  this ordered extent
+	 */
+	if (test_range_bit(io_tree, disk_i_size,
+			   ordered->file_offset + ordered->len - 1,
+			   EXTENT_DELALLOC, 0)) {
+		goto out;
+	}
+	/*
+	 * walk backward from this ordered extent to disk_i_size.
+	 * if we find an ordered extent then we can't update disk i_size
+	 * yet
+	 */
+	while(1) {
+		node = rb_prev(&ordered->rb_node);
+		if (!node)
+			break;
+		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (test->file_offset + test->len <= disk_i_size)
+			break;
+		if (test->file_offset >= inode->i_size)
+			break;
+		if (test->file_offset >= disk_i_size)
+			goto out;
+	}
+	new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+
+	/*
+	 * at this point, we know we can safely update i_size to at least
+	 * the offset from this ordered extent.  But, we need to
+	 * walk forward and see if ios from higher up in the file have
+	 * finished.
+	 */
+	node = rb_next(&ordered->rb_node);
+	i_size_test = 0;
+	if (node) {
+		/*
+		 * do we have an area where IO might have finished
+		 * between our ordered extent and the next one.
+		 */
+		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (test->file_offset > entry_end(ordered)) {
+			i_size_test = test->file_offset - 1;
+		}
+	} else {
+		i_size_test = i_size_read(inode);
+	}
+
+	/*
+	 * i_size_test is the end of a region after this ordered
+	 * extent where there are no ordered extents.  As long as there
+	 * are no delalloc bytes in this area, it is safe to update
+	 * disk_i_size to the end of the region.
+	 */
+	if (i_size_test > entry_end(ordered) &&
+	    !test_range_bit(io_tree, entry_end(ordered), i_size_test,
+			   EXTENT_DELALLOC, 0)) {
+		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+	}
+	BTRFS_I(inode)->disk_i_size = new_i_size;
+out:
+	mutex_unlock(&tree->mutex);
+	return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 33292c5fe90c..40e9126ad954 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,4 +89,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_add_ordered_pending(struct inode *inode,
 			      struct btrfs_ordered_extent *ordered,
 			      u64 start, u64 len);
+int btrfs_ordered_update_i_size(struct inode *inode,
+				struct btrfs_ordered_extent *ordered);
 #endif
-- 
cgit v1.2.3


From ee6e6504e147a59a9f4d582662c105e9d72ae638 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:40 -0400
Subject: Add a per-inode lock around btrfs_drop_extents

btrfs_drop_extents is always called with a range lock held on the inode.
But, it may operate on extents outside that range as it drops and splits
them.

This patch adds a per-inode mutex that is held while calling
btrfs_drop_extents and while inserting new extents into the tree.  It
prevents races from two procs working against adjacent ranges in the tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/file.c        |  8 ++++++++
 fs/btrfs/inode.c       | 10 ++++++++++
 4 files changed, 21 insertions(+)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81c0444f37b6..3bf40591742a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -32,6 +32,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
+	struct mutex extent_mutex;
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 343d1101c31c..4036c62b6671 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1671,6 +1671,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
 			    ref_generation, owner_objectid, owner_offset,
 			    pin, pin == 0);
+
+	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e4e5c227c0c..40ad1b2958cb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -242,6 +242,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
 	u64 inline_size;
+	int did_inline = 0;
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -275,6 +276,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		if (hole_size > 0) {
 			btrfs_wait_ordered_range(inode, last_pos_in_file,
 						 last_pos_in_file + hole_size);
+			mutex_lock(&BTRFS_I(inode)->extent_mutex);
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
@@ -289,6 +291,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size -1);
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			btrfs_check_file(root, inode);
 		}
 		if (err)
@@ -321,6 +324,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		/* step one, delete the existing extents in this range */
 		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
 					 aligned_end, aligned_end, &hint_byte);
 		if (err)
@@ -332,9 +336,13 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 					   inline_size, pages, 0, num_pages);
 		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+		did_inline = 1;
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
+		if (did_inline)
+			BTRFS_I(inode)->disk_i_size = end_pos;
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 293355c92a4f..0d1e2ba47a9c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -516,6 +516,8 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 					  trans->transid, inode->i_ino,
 					  ordered_extent->file_offset, &ins);
 	BUG_ON(ret);
+
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
@@ -531,6 +533,8 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+
 	inode->i_blocks += ordered_extent->len >> 9;
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
@@ -1399,6 +1403,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		err = btrfs_drop_extents(trans, root, inode,
 					 hole_start, block_end, hole_start,
 					 &alloc_hint);
@@ -1412,6 +1417,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						(u64)-1);
 			btrfs_check_file(root, inode);
 		}
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 		btrfs_end_transaction(trans, root);
 		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		if (err)
@@ -1578,6 +1584,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
+	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	return 0;
 }
 
@@ -1880,6 +1887,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
+	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
@@ -2105,6 +2113,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
+		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
@@ -3079,6 +3088,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
+		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-- 
cgit v1.2.3


From aec7477b3b0e8ec93f6d274f25ba40b0665134d4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:12:38 -0400
Subject: Btrfs: Implement new dir index format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  15 ++++++
 fs/btrfs/ctree.h       |   8 +--
 fs/btrfs/dir-item.c    |   4 +-
 fs/btrfs/inode-item.c  |  10 +++-
 fs/btrfs/inode.c       | 143 +++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/ioctl.c       |   4 +-
 fs/btrfs/transaction.c |   4 +-
 7 files changed, 155 insertions(+), 33 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3bf40591742a..b7bd60e4fdd7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -43,6 +43,21 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
+
+	/*
+	 * if this is a directory then index_cnt is the counter for the index
+	 * number for new files that are created
+	 */
+	u64 index_cnt;
+
+	/*
+	 * index holds the directory index for this inode on creation, so
+	 * add_link can do what its supposed to.  This isn't populated when the
+	 * inode is read because there isn't really a reason to know this unless
+	 * we are creating the directory index or deleting it, and deletion
+	 * reads the index off of the inode reference at unlink time.
+	 */
+	u64 index;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6675e916ebcd..beb05b1de54c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -372,6 +372,7 @@ struct btrfs_dev_extent {
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_ref {
+	__le64 index;
 	__le16 name_len;
 	/* name goes here */
 } __attribute__ ((__packed__));
@@ -902,6 +903,7 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
@@ -1528,7 +1530,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
-			  struct btrfs_key *location, u8 type);
+			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     struct btrfs_path *path, u64 dir,
@@ -1566,11 +1568,11 @@ int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid);
+			   u64 inode_objectid, u64 ref_objectid, u64 index);
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid);
+			   u64 inode_objectid, u64 ref_objectid, u64 *index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7a73dc59dc4d..eb4dd3d75cf9 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -110,7 +110,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
-			  struct btrfs_key *location, u8 type)
+			  struct btrfs_key *location, u8 type, u64 index)
 {
 	int ret = 0;
 	int ret2 = 0;
@@ -156,7 +156,7 @@ second_insert:
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = location->objectid;
+	key.offset = index;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
 	if (IS_ERR(dir_item)) {
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index cba30b6cc6fe..d93451c66ba1 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -52,7 +52,7 @@ int find_name_in_backref(struct btrfs_path *path, const char * name,
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid)
+			   u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -86,6 +86,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	}
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+	if (index)
+		*index = btrfs_inode_ref_index(leaf, ref);
+
 	if (del_len == item_size) {
 		ret = btrfs_del_item(trans, root, path);
 		goto out;
@@ -106,7 +110,7 @@ out:
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid)
+			   u64 inode_objectid, u64 ref_objectid, u64 index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -138,6 +142,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
 		ptr = (unsigned long)(ref + 1);
 		ret = 0;
 	} else if (ret < 0) {
@@ -146,6 +151,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
 		ptr = (unsigned long)(ref + 1);
 	}
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0e90315ea803..8d371d6fe551 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -872,6 +872,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
+	BTRFS_I(inode)->index_cnt = (u64)-1;
+
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
@@ -993,6 +995,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
+	u64 index;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -1017,8 +1020,19 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		goto err;
 	btrfs_release_path(root, path);
 
+	ret = btrfs_del_inode_ref(trans, root, name, name_len,
+				  dentry->d_inode->i_ino,
+				  dentry->d_parent->d_inode->i_ino, &index);
+	if (ret) {
+		printk("failed to delete reference to %.*s, "
+		       "inode %lu parent %lu\n", name_len, name,
+		       dentry->d_inode->i_ino,
+		       dentry->d_parent->d_inode->i_ino);
+		goto err;
+	}
+
 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-					 key.objectid, name, name_len, -1);
+					 index, name, name_len, -1);
 	if (IS_ERR(di)) {
 		ret = PTR_ERR(di);
 		goto err;
@@ -1031,15 +1045,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root, path);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
-	ret = btrfs_del_inode_ref(trans, root, name, name_len,
-				  dentry->d_inode->i_ino,
-				  dentry->d_parent->d_inode->i_ino);
-	if (ret) {
-		printk("failed to delete reference to %.*s, "
-		       "inode %lu parent %lu\n", name_len, name,
-		       dentry->d_inode->i_ino,
-		       dentry->d_parent->d_inode->i_ino);
-	}
 err:
 	btrfs_free_path(path);
 	if (!ret) {
@@ -1625,6 +1630,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	BTRFS_I(inode)->root = args->root;
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
+	BTRFS_I(inode)->index_cnt = (u64)-1;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1901,8 +1907,77 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 }
 
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key key, found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	/* FIXME: we should be able to handle this */
+	if (ret == 0)
+		goto out;
+	ret = 0;
+
+	/*
+	 * MAGIC NUMBER EXPLANATION:
+	 * since we search a directory based on f_pos we have to start at 2
+	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+	 * else has to start at 2
+	 */
+	if (path->slots[0] == 0) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+	if (found_key.objectid != inode->i_ino ||
+	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
+{
+	int ret = 0;
+
+	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+		ret = btrfs_set_inode_index_count(dir);
+		if (ret)
+			return ret;
+	}
+
+	BTRFS_I(inode)->index = BTRFS_I(dir)->index_cnt;
+	BTRFS_I(dir)->index_cnt++;
+
+	return ret;
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
+				     struct inode *dir,
 				     const char *name, int name_len,
 				     u64 ref_objectid,
 				     u64 objectid,
@@ -1928,6 +2003,20 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	if (dir) {
+		ret = btrfs_set_inode_index(dir, inode);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		BTRFS_I(inode)->index = 0;
+	}
+	/*
+	 * index_cnt is ignored for everything but a dir,
+	 * btrfs_get_inode_index_count has an explanation for the magic
+	 * number
+	 */
+	BTRFS_I(inode)->index_cnt = 2;
+
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1984,6 +2073,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+	btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->index);
 	ptr = (unsigned long)(ref + 1);
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
 
@@ -1998,6 +2088,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	insert_inode_hash(inode);
 	return inode;
 fail:
+	if (dir)
+		BTRFS_I(dir)->index_cnt--;
 	btrfs_free_path(path);
 	return ERR_PTR(ret);
 }
@@ -2014,7 +2106,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
-	struct inode *parent_inode;
+	struct inode *parent_inode = dentry->d_parent->d_inode;
 
 	key.objectid = inode->i_ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -2023,16 +2115,17 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
-				    &key, btrfs_inode_type(inode));
+				    &key, btrfs_inode_type(inode),
+				    BTRFS_I(inode)->index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
 					     dentry->d_name.name,
 					     dentry->d_name.len,
 					     inode->i_ino,
-					     dentry->d_parent->d_inode->i_ino);
+					     parent_inode->i_ino,
+					     BTRFS_I(inode)->index);
 		}
-		parent_inode = dentry->d_parent->d_inode;
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   dentry->d_name.len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -2083,7 +2176,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, mode);
@@ -2138,7 +2231,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
 				objectid, BTRFS_I(dir)->block_group, mode);
@@ -2203,10 +2296,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
+	err = btrfs_set_inode_index(dir, inode);
+	if (err)
+		goto fail;
+
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
+
 	err = btrfs_add_nondir(trans, dentry, inode, 1);
 
 	if (err)
@@ -2258,7 +2356,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFDIR | mode);
@@ -2886,9 +2984,8 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_block_group_cache *block_group)
 {
 	struct inode *inode;
-	int ret;
 
-	inode = btrfs_new_inode(trans, new_root, "..", 2, new_dirid,
+	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
 				new_dirid, block_group, S_IFDIR | 0700);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -2896,8 +2993,6 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	inode->i_fop = &btrfs_dir_file_operations;
 	new_root->inode = inode;
 
-	ret = btrfs_insert_inode_ref(trans, new_root, "..", 2, new_dirid,
-				     new_dirid);
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
 
@@ -3078,6 +3173,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		if (ret)
 			goto out_fail;
 	}
+	ret = btrfs_set_inode_index(new_dir, old_inode);
+	if (ret)
+		goto out_fail;
+
 	ret = btrfs_add_link(trans, new_dentry, old_inode, 1);
 	if (ret)
 		goto out_fail;
@@ -3123,7 +3222,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a61f2e7e2db5..faf081302d02 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -129,13 +129,13 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	dir = root->fs_info->sb->s_root->d_inode;
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    name, namelen, dir->i_ino, &key,
-				    BTRFS_FT_DIR);
+				    BTRFS_FT_DIR, 0);
 	if (ret)
 		goto fail;
 
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     name, namelen, objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
+			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
 	if (ret)
 		goto fail;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0f756e0175c0..768b0d223e68 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -577,14 +577,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    pending->name, namelen,
 				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, BTRFS_FT_DIR);
+				    &key, BTRFS_FT_DIR, 0);
 
 	if (ret)
 		goto fail;
 
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     pending->name, strlen(pending->name), objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
+			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
 
 	/* Invalidate existing dcache entry for new snapshot. */
 	btrfs_invalidate_dcache_root(root, pending->name, namelen);
-- 
cgit v1.2.3


From 33268eaf0b3db5e2bd12c0ada81a8e8f87a46d68 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:16:36 -0400
Subject: Btrfs: Add ACL support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c         | 296 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/btrfs_inode.h |   3 +
 fs/btrfs/ctree.h       |   7 ++
 fs/btrfs/inode.c       |  42 ++++++-
 fs/btrfs/super.c       |   9 +-
 fs/btrfs/xattr.c       |  26 +++--
 6 files changed, 335 insertions(+), 48 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff0fcc72848a..b192659b4f5f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -20,76 +20,302 @@
 #include <linux/string.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
 #include <linux/sched.h>
+
 #include "ctree.h"
+#include "btrfs_inode.h"
 #include "xattr.h"
-#ifndef is_owner_or_cap
-#define is_owner_or_cap(inode)	\
-	((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
-#endif
+
+static void btrfs_update_cached_acl(struct inode *inode,
+				    struct posix_acl **p_acl,
+				    struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(*p_acl);
+	*p_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+	int size, name_index;
+	char *value = NULL;
+	struct posix_acl *acl = NULL, **p_acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	spin_lock(&inode->i_lock);
+	if (*p_acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*p_acl);
+	spin_unlock(&inode->i_lock);
+
+	if (acl)
+		return acl;
+
+
+	size = btrfs_xattr_get(inode, name_index, "", NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = btrfs_xattr_get(inode, name_index, "", value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			btrfs_update_cached_acl(inode, p_acl, acl);
+		}
+		kfree(value);
+	} else if (size == -ENOENT) {
+		acl = NULL;
+		btrfs_update_cached_acl(inode, p_acl, acl);
+	}
+
+	return acl;
+}
+
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+			       void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int ret = 0;
+
+	acl = btrfs_get_acl(inode, type);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, value, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int ret, name_index = 0, size = 0;
+	struct posix_acl **p_acl;
+	char *value = NULL;
+	mode_t mode;
+
+	if (acl) {
+		ret = posix_acl_valid(acl);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+	}
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		mode = inode->i_mode;
+		ret = posix_acl_equiv_mode(acl, &mode);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+		inode->i_mode = mode;
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EINVAL : 0;
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(acl, value, size);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = btrfs_xattr_set(inode, name_index, "", value, size, 0);
+
+out:
+	if (value)
+		kfree(value);
+
+	if (!ret)
+		btrfs_update_cached_acl(inode, p_acl, acl);
+
+	return ret;
+}
 
 static int btrfs_xattr_set_acl(struct inode *inode, int type,
 			       const void *value, size_t size)
 {
 	int ret = 0;
-	struct posix_acl *acl;
+	struct posix_acl *acl = NULL;
 
-	if (!is_owner_or_cap(inode))
-		return -EPERM;
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
 			value = NULL;
 			size = 0;
 		} else if (IS_ERR(acl)) {
-			ret = PTR_ERR(acl);
-		} else {
-			ret = posix_acl_valid(acl);
-			posix_acl_release(acl);
+			return PTR_ERR(acl);
 		}
-		if (ret)
-			return ret;
 	}
-	return btrfs_xattr_set(inode, type, "", value, size, 0);
-}
 
-static int btrfs_xattr_get_acl(struct inode *inode, int type,
-			       void *value, size_t size)
-{
-	return btrfs_xattr_get(inode, type, "", value, size);
+	ret = btrfs_set_acl(inode, acl, type);
+
+	posix_acl_release(acl);
+
+	return ret;
 }
+
+
 static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
 				      void *value, size_t size)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
-				   value, size);
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
+
 static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
 				      const void *value, size_t size, int flags)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
-				   value, size);
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
+
 static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 				       void *value, size_t size)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-				   value, size);
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
+
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
 				       const void *value, size_t size, int flags)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-				   value, size);
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int error = -EAGAIN;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+	}
+
+	return error;
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that.  If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	/* this happens with subvols */
+	if (!dir)
+		return 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (IS_POSIXACL(dir)) {
+			acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+
+	if (IS_POSIXACL(dir) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+			if (ret)
+				goto failed;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto failed;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				/* we need an acl */
+				ret = btrfs_set_acl(inode, clone,
+						    ACL_TYPE_ACCESS);
+			}
+		}
+	}
+failed:
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int ret = 0;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!IS_POSIXACL(inode))
+		return 0;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+
+	posix_acl_release(clone);
+
+	return ret;
 }
+
 struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.list	= btrfs_xattr_generic_list,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b7bd60e4fdd7..9f2a4ef944a7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,6 +36,9 @@ struct btrfs_inode {
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
+	struct posix_acl *i_acl;
+	struct posix_acl *i_default_acl;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53d315bdd16d..f87d7263f2d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,6 +42,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
+#define BTRFS_ACL_NOT_CACHED    ((void *)-1)
+
 #ifdef CONFIG_LOCKDEP
 # define BTRFS_MAX_LEVEL 7
 #else
@@ -1694,4 +1696,9 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+
+/* acl.c */
+int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_acl_chmod(struct inode *inode);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d371d6fe551..2d8853543a71 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1478,6 +1479,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 out:
 	err = inode_setattr(inode, attr);
+
+	if (!err && ((attr->ia_valid & ATTR_MODE)))
+		err = btrfs_acl_chmod(inode);
 fail:
 	return err;
 }
@@ -2184,6 +2188,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -2239,6 +2249,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -2366,6 +2382,11 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 
 	drop_on_err = 1;
+
+	err = btrfs_init_acl(inode, dir);
+	if (err)
+		goto out_fail;
+
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 	btrfs_set_trans_block_group(trans, inode);
@@ -3023,6 +3044,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	ei->last_trans = 0;
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+	ei->i_acl = BTRFS_ACL_NOT_CACHED;
+	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
 	return &ei->vfs_inode;
 }
 
@@ -3032,6 +3055,13 @@ void btrfs_destroy_inode(struct inode *inode)
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	if (BTRFS_I(inode)->i_acl &&
+	    BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(BTRFS_I(inode)->i_acl);
+	if (BTRFS_I(inode)->i_default_acl &&
+	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(BTRFS_I(inode)->i_default_acl);
+
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
@@ -3230,6 +3260,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -3310,7 +3346,7 @@ static int btrfs_permission(struct inode *inode, int mask,
 {
 	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, btrfs_check_acl);
 }
 
 static struct inode_operations btrfs_dir_inode_operations = {
@@ -3392,6 +3428,10 @@ static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= generic_removexattr,
 };
 static struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4cb6aac5122e..a6a418b6894b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,7 +83,8 @@ static match_table_t tokens = {
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_ssd, "ssd"},
-	{Opt_err, NULL}
+	{Opt_noacl, "noacl"},
+	{Opt_err, NULL},
 };
 
 u64 btrfs_parse_size(char *str)
@@ -215,6 +216,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 					info->alloc_start);
 			}
 			break;
+		case Opt_noacl:
+			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+			break;
 		default:
 			break;
 		}
@@ -301,6 +305,7 @@ static int btrfs_fill_super(struct super_block * sb,
 	sb->s_op = &btrfs_super_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
+	sb->s_flags |= MS_POSIXACL;
 
 	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6730b5958844..121c9550314f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -26,25 +26,27 @@
 #include "transaction.h"
 #include "xattr.h"
 #include "disk-io.h"
+
 static struct xattr_handler *btrfs_xattr_handler_map[] = {
 	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-//	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
-//	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
 #endif
 	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
 	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
-//	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
+	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
 };
+
 struct xattr_handler *btrfs_xattr_handlers[] = {
 	&btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-//	&btrfs_xattr_acl_access_handler,
-//	&btrfs_xattr_acl_default_handler,
+	&btrfs_xattr_acl_access_handler,
+	&btrfs_xattr_acl_default_handler,
 #endif
 	&btrfs_xattr_trusted_handler,
 	&btrfs_xattr_security_handler,
-//	&btrfs_xattr_system_handler,
+	&btrfs_xattr_system_handler,
 	NULL,
 };
 
@@ -237,10 +239,14 @@ int btrfs_xattr_set(struct inode *inode, int name_index,
 			mod = 1;
 			goto out;
 		}
-	} else if (flags & XATTR_REPLACE) {
-		/* we couldn't find the attr to replace, so error out */
-		ret = -ENODATA;
-		goto out;
+	} else {
+		btrfs_release_path(root, path);
+
+		if (flags & XATTR_REPLACE) {
+			/* we couldn't find the attr to replace */
+			ret = -ENODATA;
+			goto out;
+		}
 	}
 
 	/* ok we have to create a completely new xattr */
-- 
cgit v1.2.3


From 7b1287662304c3cb05cb38f5e3e2d69f386e8f10 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:17:14 -0400
Subject: Btrfs: Create orphan inode records to prevent lost files after a
 crash

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/btrfs_inode.h |   3 +
 fs/btrfs/ctree.c       |   2 +-
 fs/btrfs/ctree.h       |  14 +++
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/inode.c       | 237 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/orphan.c      |  67 ++++++++++++++
 7 files changed, 322 insertions(+), 5 deletions(-)
 create mode 100644 fs/btrfs/orphan.c

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f8a38b0da7fb..5a0fd7b0e3e7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9f2a4ef944a7..df624fd735c1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,9 @@ struct btrfs_inode {
 	struct posix_acl *i_acl;
 	struct posix_acl *i_default_acl;
 
+	/* for keeping track of orphaned inodes */
+	struct list_head i_orphan;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdc713062b03..f2a94999c371 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2622,7 +2622,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		total_data += data_size[i];
 	}
 
-	total_size = total_data + (nr - 1) * sizeof(struct btrfs_item);
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0) {
 		return -EEXIST;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f87d7263f2d7..012ad529cb18 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -74,6 +74,9 @@ struct btrfs_ordered_sum;
 /* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
+/* orhpan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
 /*
  * All files have objectids higher than this.
  */
@@ -646,6 +649,9 @@ struct btrfs_root {
 
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
+
+	spinlock_t orphan_lock;
+	struct list_head orphan_list;
 };
 
 /*
@@ -657,6 +663,7 @@ struct btrfs_root {
 #define BTRFS_INODE_ITEM_KEY		1
 #define BTRFS_INODE_REF_KEY		2
 #define BTRFS_XATTR_ITEM_KEY		8
+#define BTRFS_ORPHAN_ITEM_KEY		9
 /* reserve 2-15 close to the inode for later flexibility */
 
 /*
@@ -1560,6 +1567,13 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path, u64 dir,
 					  const char *name, u16 name_len,
 					  int mod);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset);
+
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec01062eb41d..d60923967347 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -732,7 +732,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->in_sysfs = 0;
 
 	INIT_LIST_HEAD(&root->dirty_list);
+	INIT_LIST_HEAD(&root->orphan_list);
 	spin_lock_init(&root->node_lock);
+	spin_lock_init(&root->orphan_lock);
 	mutex_init(&root->objectid_mutex);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2d8853543a71..0c9ec8aa304a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -78,6 +78,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
+static void btrfs_truncate(struct inode *inode);
+
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
@@ -826,6 +828,190 @@ zeroit:
 	return -EIO;
 }
 
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	spin_lock(&root->orphan_lock);
+
+	/* already on the orphan list, we're good */
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+
+	spin_unlock(&root->orphan_lock);
+
+	/*
+	 * insert an orphan item to track this unlinked/truncated file
+	 */
+	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+
+	return ret;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	spin_lock(&root->orphan_lock);
+
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	list_del_init(&BTRFS_I(inode)->i_orphan);
+	if (!trans) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	spin_unlock(&root->orphan_lock);
+
+	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+
+	return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+void btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	struct btrfs_key key, found_key;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode;
+	int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+	/* don't do orphan cleanup if the fs is readonly. */
+	if (root->inode->i_sb->s_flags & MS_RDONLY)
+		return;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+	path->reada = -1;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, root->inode);
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			printk(KERN_ERR "Error searching slot for orphan: %d"
+			       "\n", ret);
+			break;
+		}
+
+		/*
+		 * if ret == 0 means we found what we were searching for, which
+		 * is weird, but possible, so only screw with path if we didnt
+		 * find the key and see if we have stuff that matches
+		 */
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		/* pull out the item */
+		leaf = path->nodes[0];
+		item = btrfs_item_nr(leaf, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		/* make sure the item matches what we want */
+		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		/* release the path since we're done with it */
+		btrfs_release_path(root, path);
+
+		/*
+		 * this is where we are basically btrfs_lookup, without the
+		 * crossing root thing.  we store the inode number in the
+		 * offset of the orphan item.
+		 */
+		inode = btrfs_iget_locked(root->inode->i_sb,
+					  found_key.offset, root);
+		if (!inode)
+			break;
+
+		if (inode->i_state & I_NEW) {
+			BTRFS_I(inode)->root = root;
+
+			/* have to set the location manually */
+			BTRFS_I(inode)->location.objectid = inode->i_ino;
+			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+			BTRFS_I(inode)->location.offset = 0;
+
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+		}
+
+		/*
+		 * add this inode to the orphan list so btrfs_orphan_del does
+		 * the proper thing when we hit it
+		 */
+		spin_lock(&root->orphan_lock);
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+		spin_unlock(&root->orphan_lock);
+
+		/*
+		 * if this is a bad inode, means we actually succeeded in
+		 * removing the inode, but not the orphan record, which means
+		 * we need to manually delete the orphan since iput will just
+		 * do a destroy_inode
+		 */
+		if (is_bad_inode(inode)) {
+			btrfs_orphan_del(trans, inode);
+			iput(inode);
+			continue;
+		}
+
+		/* if we have links, this was a truncate, lets do that */
+		if (inode->i_nlink) {
+			nr_truncate++;
+			btrfs_truncate(inode);
+		} else {
+			nr_unlink++;
+		}
+
+		/* this will do delete_inode and everything for us */
+		iput(inode);
+	}
+
+	if (nr_unlink)
+		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+	if (nr_truncate)
+		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+
+	btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+}
+
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -1067,6 +1253,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
+	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -1080,6 +1267,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+
+	if (inode->i_nlink == 0)
+		ret = btrfs_orphan_add(trans, inode);
+
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
@@ -1108,12 +1299,17 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
+	err = btrfs_orphan_add(trans, inode);
+	if (err)
+		goto fail_trans;
+
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
 	if (!err) {
 		btrfs_i_size_write(inode, 0);
 	}
 
+fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 fail:
@@ -1131,6 +1327,9 @@ fail:
  *
  * csum items that cross the new i_size are truncated to the new size
  * as well.
+ *
+ * min_type is the minimum key type to truncate down to.  If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
@@ -1495,6 +1694,7 @@ void btrfs_delete_inode(struct inode *inode)
 
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
+		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
@@ -1504,8 +1704,12 @@ void btrfs_delete_inode(struct inode *inode)
 
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode, 0);
-	if (ret)
+	if (ret) {
+		btrfs_orphan_del(NULL, inode);
 		goto no_delete_lock;
+	}
+
+	btrfs_orphan_del(trans, inode);
 
 	nr = trans->blocks_used;
 	clear_inode(inode);
@@ -1688,7 +1892,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret;
+	int ret, do_orphan = 0;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1706,6 +1910,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
+
 		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
 					  sub_root);
 		if (!inode)
@@ -1715,6 +1920,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			if (sub_root != root) {
 				igrab(inode);
 				sub_root->inode = inode;
+				do_orphan = 1;
 			}
 			BTRFS_I(inode)->root = sub_root;
 			memcpy(&BTRFS_I(inode)->location, &location,
@@ -1723,6 +1929,10 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			unlock_new_inode(inode);
 		}
 	}
+
+	if (unlikely(do_orphan))
+		btrfs_orphan_cleanup(sub_root);
+
 	return d_splice_alias(inode, dentry);
 }
 
@@ -2964,12 +3174,19 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_i_size_write(inode, inode->i_size);
 
+	ret = btrfs_orphan_add(trans, inode);
+	if (ret)
+		goto out;
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode,
 				      BTRFS_EXTENT_DATA_KEY);
 	btrfs_update_inode(trans, root, inode);
-	nr = trans->blocks_used;
 
+	ret = btrfs_orphan_del(trans, inode);
+	BUG_ON(ret);
+
+out:
+	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 	BUG_ON(ret);
 	btrfs_btree_balance_dirty(root, nr);
@@ -3046,6 +3263,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
+	INIT_LIST_HEAD(&ei->i_orphan);
 	return &ei->vfs_inode;
 }
 
@@ -3062,6 +3280,14 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
+	spin_lock(&BTRFS_I(inode)->root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+		       " list\n", inode->i_ino);
+		dump_stack();
+	}
+	spin_unlock(&BTRFS_I(inode)->root->orphan_lock);
+
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
@@ -3202,6 +3428,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
 		if (ret)
 			goto out_fail;
+		if (new_inode->i_nlink == 0) {
+			ret = btrfs_orphan_add(trans, new_inode);
+			if (ret)
+				goto out_fail;
+		}
 	}
 	ret = btrfs_set_inode_index(new_dir, old_inode);
 	if (ret)
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
-- 
cgit v1.2.3


From ea8c281947950fac5f78818b767821d696c9512a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 23:17:27 -0400
Subject: Btrfs: Maintain a list of inodes that are delalloc and a way to wait
 on them

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 12 +++++++++---
 fs/btrfs/file.c        |  3 +--
 fs/btrfs/inode.c       | 47 +++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/ioctl.c       |  4 ++--
 7 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index df624fd735c1..534ac4153ddd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -42,6 +42,8 @@ struct btrfs_inode {
 	/* for keeping track of orphaned inodes */
 	struct list_head i_orphan;
 
+	struct list_head delalloc_inodes;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 62499dd761b7..116aee21bf77 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -551,6 +551,7 @@ struct btrfs_fs_info {
 	 */
 	spinlock_t ordered_extent_lock;
 	struct list_head ordered_extents;
+	struct list_head delalloc_inodes;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -637,6 +638,7 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+
 	u64 objectid;
 	u64 last_trans;
 
@@ -1651,6 +1653,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 #define PageChecked PageFsMisc
 #endif
 
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index da9dda4338a3..76543683f3b9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1234,6 +1234,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
+	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dbde12881244..33cb2ac4cb28 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1230,7 +1230,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
 		found->full = 0;
-		WARN_ON(found->total_bytes < found->bytes_used);
 		*space_info = found;
 		return 0;
 	}
@@ -2841,8 +2840,7 @@ again:
 		 */
 		clear_page_dirty_for_io(page);
 
-		set_extent_delalloc(io_tree, page_start,
-				    page_end, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
 		set_page_dirty(page);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3319,6 +3317,13 @@ again:
 	key.type = 0;
 	cur_byte = key.objectid;
 
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
+	btrfs_start_delalloc_inodes(root);
+	btrfs_wait_ordered_extents(tree_root);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -3401,6 +3406,7 @@ next:
 
 		btrfs_clean_old_snapshots(tree_root);
 
+		btrfs_start_delalloc_inodes(root);
 		btrfs_wait_ordered_extents(tree_root);
 
 		trans = btrfs_start_transaction(tree_root, 1);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8915f2dc1bce..eb8e4556fa71 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -312,8 +312,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * to reset the delalloc bit on things that already have
 		 * extents reserved.
 		 */
-		set_extent_delalloc(io_tree, start_pos,
-				    end_of_last_block, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 640648c66b22..8a405a5fa6a3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -303,6 +303,10 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
+		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+				      &root->fs_info->delalloc_inodes);
+		}
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
@@ -325,6 +329,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
+		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		}
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
@@ -408,6 +416,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+{
+	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+				   GFP_NOFS);
+}
+
 struct btrfs_writepage_fixup {
 	struct page *page;
 	struct btrfs_work work;
@@ -453,8 +467,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			    GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ClearPageChecked(page);
 out:
 	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
@@ -1530,8 +1543,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
 		kaddr = kmap(page);
@@ -1766,6 +1778,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
@@ -2158,6 +2171,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
@@ -2400,6 +2414,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
@@ -3049,8 +3064,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ret = 0;
 
 	/* page is wholly or partially inside EOF */
@@ -3373,6 +3387,26 @@ out_unlock:
 	return ret;
 }
 
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+{
+	struct list_head *head = &root->fs_info->delalloc_inodes;
+	struct btrfs_inode *binode;
+	unsigned long flags;
+
+	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	while(!list_empty(head)) {
+		binode = list_entry(head->next, struct btrfs_inode,
+				    delalloc_inodes);
+		atomic_inc(&binode->vfs_inode.i_count);
+		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+		filemap_write_and_wait(binode->vfs_inode.i_mapping);
+		iput(&binode->vfs_inode);
+		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	}
+	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
@@ -3436,6 +3470,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0b63c3c77cfd..e1046a54b1c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -274,8 +274,7 @@ again:
 		 */
 		clear_page_dirty_for_io(page);
 
-		set_extent_delalloc(io_tree, page_start,
-				    page_end, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
@@ -784,6 +783,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_TRANS_END:
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
+		btrfs_start_delalloc_inodes(root);
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
 	}
-- 
cgit v1.2.3


From 00e4e6b33a0f78aab4b788d6d31c884fd8bf88da Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 11:18:09 -0400
Subject: Get rid of BTRFS_I(inode)->index and use local vars instead

rename and link don't always have a lock on the source inode, and
our use of a per-inode index variable was racy.  This changes things to
store the index in a local variable instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  9 --------
 fs/btrfs/inode.c       | 59 +++++++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 534ac4153ddd..111f90524ae1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,15 +57,6 @@ struct btrfs_inode {
 	 * number for new files that are created
 	 */
 	u64 index_cnt;
-
-	/*
-	 * index holds the directory index for this inode on creation, so
-	 * add_link can do what its supposed to.  This isn't populated when the
-	 * inode is read because there isn't really a reason to know this unless
-	 * we are creating the directory index or deleting it, and deletion
-	 * reads the index off of the inode reference at unlink time.
-	 */
-	u64 index;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 99121a55ffbe..4d8ffc01931e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2153,7 +2153,8 @@ out:
 	return ret;
 }
 
-static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
+static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
+				 u64 *index)
 {
 	int ret = 0;
 
@@ -2163,7 +2164,7 @@ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
 			return ret;
 	}
 
-	BTRFS_I(inode)->index = BTRFS_I(dir)->index_cnt;
+	*index = BTRFS_I(dir)->index_cnt;
 	BTRFS_I(dir)->index_cnt++;
 
 	return ret;
@@ -2176,7 +2177,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     u64 ref_objectid,
 				     u64 objectid,
 				     struct btrfs_block_group_cache *group,
-				     int mode)
+				     int mode, u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
@@ -2198,11 +2199,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	if (dir) {
-		ret = btrfs_set_inode_index(dir, inode);
+		ret = btrfs_set_inode_index(dir, inode, index);
 		if (ret)
 			return ERR_PTR(ret);
-	} else {
-		BTRFS_I(inode)->index = 0;
 	}
 	/*
 	 * index_cnt is ignored for everything but a dir,
@@ -2268,7 +2267,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
-	btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->index);
+	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 	ptr = (unsigned long)(ref + 1);
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
 
@@ -2296,7 +2295,7 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 
 static int btrfs_add_link(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
-			    int add_backref)
+			    int add_backref, u64 index)
 {
 	int ret;
 	struct btrfs_key key;
@@ -2311,7 +2310,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode),
-				    BTRFS_I(inode)->index);
+				    index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
@@ -2319,7 +2318,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 					     dentry->d_name.len,
 					     inode->i_ino,
 					     parent_inode->i_ino,
-					     BTRFS_I(inode)->index);
+					     index);
 		}
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   dentry->d_name.len * 2);
@@ -2332,9 +2331,9 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 
 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
-			    int backref)
+			    int backref, u64 index)
 {
-	int err = btrfs_add_link(trans, dentry, inode, backref);
+	int err = btrfs_add_link(trans, dentry, inode, backref, index);
 	if (!err) {
 		d_instantiate(dentry, inode);
 		return 0;
@@ -2354,6 +2353,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int drop_inode = 0;
 	u64 objectid;
 	unsigned long nr = 0;
+	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -2374,7 +2374,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, mode);
+				BTRFS_I(dir)->block_group, mode, &index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -2386,7 +2386,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -2419,6 +2419,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	int drop_inode = 0;
 	unsigned long nr = 0;
 	u64 objectid;
+	u64 index = 0;
 
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
@@ -2435,7 +2436,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
-				objectid, BTRFS_I(dir)->block_group, mode);
+				objectid, BTRFS_I(dir)->block_group, mode,
+				&index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -2447,7 +2449,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -2489,6 +2491,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
+	u64 index;
 	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
@@ -2504,7 +2507,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
-	err = btrfs_set_inode_index(dir, inode);
+	err = btrfs_set_inode_index(dir, inode, &index);
 	if (err)
 		goto fail;
 
@@ -2513,7 +2516,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
 
-	err = btrfs_add_nondir(trans, dentry, inode, 1);
+	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
 	if (err)
 		drop_inode = 1;
@@ -2544,6 +2547,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int err = 0;
 	int drop_on_err = 0;
 	u64 objectid = 0;
+	u64 index = 0;
 	unsigned long nr = 1;
 
 	err = btrfs_check_free_space(root, 1, 0);
@@ -2567,7 +2571,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFDIR | mode);
+				BTRFS_I(dir)->block_group, S_IFDIR | mode,
+				&index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -2588,7 +2593,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
-	err = btrfs_add_link(trans, dentry, inode, 0);
+	err = btrfs_add_link(trans, dentry, inode, 0, index);
 	if (err)
 		goto out_fail;
 
@@ -3203,9 +3208,10 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_block_group_cache *block_group)
 {
 	struct inode *inode;
+	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, block_group, S_IFDIR | 0700);
+				new_dirid, block_group, S_IFDIR | 0700, &index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
@@ -3384,6 +3390,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
+	u64 index = 0;
 	int ret;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -3419,11 +3426,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 				goto out_fail;
 		}
 	}
-	ret = btrfs_set_inode_index(new_dir, old_inode);
+	ret = btrfs_set_inode_index(new_dir, old_inode, &index);
 	if (ret)
 		goto out_fail;
 
-	ret = btrfs_add_link(trans, new_dentry, old_inode, 1);
+	ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index);
 	if (ret)
 		goto out_fail;
 
@@ -3464,6 +3471,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
+	u64 index = 0 ;
 	int name_len;
 	int datasize;
 	unsigned long ptr;
@@ -3491,7 +3499,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
+				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+				&index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -3503,7 +3512,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
-- 
cgit v1.2.3


From e02119d5a7b4396c5a872582fddc8bd6d305a70a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:13:11 -0400
Subject: Btrfs: Add a write ahead tree log to optimize synchronous operations

File syncs and directory syncs are optimized by copying their
items into a special (copy-on-write) log tree.  There is one log tree per
subvolume and the btrfs super block points to a tree of log tree roots.

After a crash, items are copied out of the log tree and back into the
subvolume.  See tree-log.c for all the details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    8 +
 fs/btrfs/compat.h      |   15 +
 fs/btrfs/ctree.c       |   73 +-
 fs/btrfs/ctree.h       |   72 +-
 fs/btrfs/dir-item.c    |    3 +
 fs/btrfs/disk-io.c     |  138 ++-
 fs/btrfs/disk-io.h     |    8 +-
 fs/btrfs/extent-tree.c |   93 +-
 fs/btrfs/file.c        |   39 +-
 fs/btrfs/inode.c       |  261 ++---
 fs/btrfs/root-tree.c   |    5 +-
 fs/btrfs/transaction.c |   45 +-
 fs/btrfs/transaction.h |    1 +
 fs/btrfs/tree-defrag.c |    4 +-
 fs/btrfs/tree-log.c    | 2804 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/tree-log.h    |   41 +
 17 files changed, 3408 insertions(+), 205 deletions(-)
 create mode 100644 fs/btrfs/tree-log.c
 create mode 100644 fs/btrfs/tree-log.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3a01065d4ef1..b7addbfd8c22 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,8 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o acl.o export.o
-
+	   ref-cache.o export.o tree-log.o acl.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 111f90524ae1..fcc8cf27e906 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -33,6 +33,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
 	struct mutex extent_mutex;
+	struct mutex log_mutex;
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
@@ -44,10 +45,17 @@ struct btrfs_inode {
 
 	struct list_head delalloc_inodes;
 
+	/* full 64 bit generation number */
+	u64 generation;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	/*
+	 * transid that last logged this inode
+	 */
+	u64 logged_trans;
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index d45fb37887bc..b0ed1887d9b1 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -22,6 +22,21 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+static inline void btrfs_drop_nlink(struct inode *inode)
+{
+	inode->i_nlink--;
+}
+
+static inline void btrfs_inc_nlink(struct inode *inode)
+{
+	inode->i_nlink++;
+}
+#else
+# define btrfs_drop_nlink(inode) drop_nlink(inode)
+# define btrfs_inc_nlink(inode)	inc_nlink(inode)
+#endif
+
 /*
  * Even if AppArmor isn't enabled, it still has different prototypes.
  * Add more distro/version pairs here to declare which has AppArmor applied.
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7114faafa9d4..579124043d9b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -60,7 +60,7 @@ void btrfs_free_path(struct btrfs_path *p)
 	kmem_cache_free(btrfs_path_cachep, p);
 }
 
-void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 
@@ -176,7 +176,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
@@ -294,7 +294,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
@@ -677,9 +677,10 @@ static int noinline check_block(struct btrfs_root *root,
  *
  * slot may point to max if the key is bigger than all of the keys
  */
-static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
-			      int item_size, struct btrfs_key *key,
-			      int max, int *slot)
+static noinline int generic_bin_search(struct extent_buffer *eb,
+				       unsigned long p,
+				       int item_size, struct btrfs_key *key,
+				       int max, int *slot)
 {
 	int low = 0;
 	int high = max;
@@ -765,7 +766,7 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
-static struct extent_buffer *read_node_slot(struct btrfs_root *root,
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
 	int level = btrfs_header_level(parent);
@@ -781,7 +782,7 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		       btrfs_node_ptr_generation(parent, slot));
 }
 
-static int balance_level(struct btrfs_trans_handle *trans,
+static noinline int balance_level(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 struct btrfs_path *path, int level)
 {
@@ -1128,8 +1129,9 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 /*
  * readahead one full node of leaves
  */
-static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-			     int level, int slot, u64 objectid)
+static noinline void reada_for_search(struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1201,7 +1203,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	}
 }
 
-static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
+static noinline void unlock_up(struct btrfs_path *path, int level,
+			       int lowest_unlock)
 {
 	int i;
 	int skip_level = level;
@@ -1759,8 +1762,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
  *
  * returns 0 on success and < 0 on failure
  */
-static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, int level)
+static noinline int split_node(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path, int level)
 {
 	u64 root_gen;
 	struct extent_buffer *c;
@@ -1874,7 +1878,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf)
+int noinline btrfs_leaf_free_space(struct btrfs_root *root,
+				   struct extent_buffer *leaf)
 {
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
@@ -2283,9 +2288,11 @@ out:
  *
  * returns 0 if all went well and < 0 on failure.
  */
-static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_key *ins_key,
-		      struct btrfs_path *path, int data_size, int extend)
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_key *ins_key,
+			       struct btrfs_path *path, int data_size,
+			       int extend)
 {
 	u64 root_gen;
 	struct extent_buffer *l;
@@ -3079,6 +3086,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_key *max_key,
 			 struct btrfs_path *path, int cache_only,
 			 u64 min_trans)
 {
@@ -3093,6 +3101,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 again:
 	cur = btrfs_lock_root_node(root);
 	level = btrfs_header_level(cur);
+	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
 	path->locks[level] = 1;
 
@@ -3107,6 +3116,8 @@ again:
 
 		/* at level = 0, we're done, setup the path and exit */
 		if (level == 0) {
+			if (slot >= nritems)
+				goto find_next_key;
 			ret = 0;
 			path->slots[level] = slot;
 			btrfs_item_key_to_cpu(cur, &found_key, slot);
@@ -3123,6 +3134,8 @@ again:
 			u64 blockptr;
 			u64 gen;
 			struct extent_buffer *tmp;
+			struct btrfs_disk_key disk_key;
+
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
 			if (gen < min_trans) {
@@ -3132,6 +3145,14 @@ again:
 			if (!cache_only)
 				break;
 
+			if (max_key) {
+				btrfs_node_key(cur, &disk_key, slot);
+				if (comp_keys(&disk_key, max_key) >= 0) {
+					ret = 1;
+					goto out;
+				}
+			}
+
 			tmp = btrfs_find_tree_block(root, blockptr,
 					    btrfs_level_size(root, level - 1));
 
@@ -3143,14 +3164,16 @@ again:
 				free_extent_buffer(tmp);
 			slot++;
 		}
+find_next_key:
 		/*
 		 * we didn't find a candidate key in this node, walk forward
 		 * and find another one
 		 */
 		if (slot >= nritems) {
-			ret = btrfs_find_next_key(root, path, min_key, level,
+			path->slots[level] = slot;
+			sret = btrfs_find_next_key(root, path, min_key, level,
 						  cache_only, min_trans);
-			if (ret == 0) {
+			if (sret == 0) {
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
@@ -3351,6 +3374,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 {
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	u32 nritems;
 	int ret;
 
 	while(1) {
@@ -3362,9 +3386,20 @@ int btrfs_previous_item(struct btrfs_root *root,
 			path->slots[0]--;
 		}
 		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (nritems == 0)
+			return 1;
+		if (path->slots[0] == nritems)
+			path->slots[0]--;
+
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.type == type)
 			return 0;
+		if (found_key.objectid < min_objectid)
+			break;
+		if (found_key.objectid == min_objectid &&
+		    found_key.type < type)
+			break;
 	}
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b305ae7e10b0..6532b60683ef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -77,6 +77,10 @@ struct btrfs_ordered_sum;
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
 /*
  * All files have objectids higher than this.
  */
@@ -276,6 +280,7 @@ struct btrfs_super_block {
 	__le64 generation;
 	__le64 root;
 	__le64 chunk_root;
+	__le64 log_root;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -287,6 +292,7 @@ struct btrfs_super_block {
 	__le32 sys_chunk_array_size;
 	u8 root_level;
 	u8 chunk_root_level;
+	u8 log_root_level;
 	struct btrfs_dev_item dev_item;
 	char label[BTRFS_LABEL_SIZE];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
@@ -392,7 +398,10 @@ struct btrfs_timespec {
  * make a new item type
  */
 struct btrfs_inode_item {
+	/* nfs style generation number */
 	__le64 generation;
+	/* transid that last touched this inode */
+	__le64 transid;
 	__le64 size;
 	__le64 nblocks;
 	__le64 block_group;
@@ -409,8 +418,13 @@ struct btrfs_inode_item {
 	struct btrfs_timespec otime;
 } __attribute__ ((__packed__));
 
+struct btrfs_dir_log_item {
+	__le64 end;
+} __attribute__ ((__packed__));
+
 struct btrfs_dir_item {
 	struct btrfs_disk_key location;
+	__le64 transid;
 	__le16 data_len;
 	__le16 name_len;
 	u8 type;
@@ -505,6 +519,9 @@ struct btrfs_fs_info {
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
+
+	/* the log root tree is a directory of all the other log roots */
+	struct btrfs_root *log_root_tree;
 	struct radix_tree_root fs_roots_radix;
 
 	struct extent_io_tree free_space_cache;
@@ -518,6 +535,7 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	u64 last_trans_new_blockgroup;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
@@ -527,6 +545,9 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
 	wait_queue_head_t async_submit_wait;
+
+	wait_queue_head_t tree_log_wait;
+
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
@@ -535,6 +556,7 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
+	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
 	struct mutex alloc_mutex;
@@ -544,8 +566,13 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
+
 	atomic_t nr_async_submits;
 	atomic_t nr_async_bios;
+	atomic_t tree_log_writers;
+	atomic_t tree_log_commit;
+	unsigned long tree_log_batch;
+	u64 tree_log_transid;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -583,6 +610,7 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
+	int log_root_recovering;
 	atomic_t throttles;
 	atomic_t throttle_gen;
 
@@ -596,6 +624,7 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
+	u64 last_log_alloc;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -632,6 +661,7 @@ struct btrfs_root {
 	struct btrfs_leaf_ref_tree *ref_tree;
 	struct btrfs_leaf_ref_tree ref_tree_struct;
 	struct btrfs_dirty_root *dirty_root;
+	struct btrfs_root *log_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -640,6 +670,7 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+	struct mutex log_mutex;
 
 	u64 objectid;
 	u64 last_trans;
@@ -692,6 +723,8 @@ struct btrfs_root {
  * dir items are the name -> inode pointers in a directory.  There is one
  * for every name in a directory.
  */
+#define BTRFS_DIR_LOG_ITEM_KEY  14
+#define BTRFS_DIR_LOG_INDEX_KEY 15
 #define BTRFS_DIR_ITEM_KEY	16
 #define BTRFS_DIR_INDEX_KEY	17
 /*
@@ -703,7 +736,8 @@ struct btrfs_root {
  */
 #define BTRFS_CSUM_ITEM_KEY	19
 
-/* reserve 20-31 for other file stuff */
+
+/* reserve 21-31 for other file/dir stuff */
 
 /*
  * root items point to tree roots.  There are typically in the root
@@ -938,6 +972,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
 BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
@@ -1126,10 +1161,13 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb,
 	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
 /* struct btrfs_dir_item */
 BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
 
 static inline void btrfs_dir_item_key(struct extent_buffer *eb,
 				      struct btrfs_dir_item *item,
@@ -1301,7 +1339,11 @@ BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
 			 chunk_root, 64);
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
-			 chunk_root_level, 64);
+			 chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+			 log_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -1405,6 +1447,12 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+			u64 start, u64 len);
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+				u64 bytenr, u64 num, int pin);
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key, u64 bytenr);
@@ -1448,6 +1496,11 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1488,9 +1541,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
 			int cache_only, u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_key *max_key,
 			 struct btrfs_path *path, int cache_only,
 			 u64 min_trans);
-
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -1656,6 +1709,18 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 #define PageChecked PageFsMisc
 #endif
 
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct inode *inode, u64 new_size,
+			       u32 min_type);
+
 int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
@@ -1715,6 +1780,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* file.c */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 125094617fe8..e4f30090d640 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -96,6 +96,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
 	btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	btrfs_set_dir_data_len(leaf, dir_item, data_len);
 	name_ptr = (unsigned long)(dir_item + 1);
 	data_ptr = (unsigned long)((char *)name_ptr + name_len);
@@ -142,6 +143,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_type(leaf, dir_item, type);
 	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	name_ptr = (unsigned long)(dir_item + 1);
 
 	write_extent_buffer(leaf, name, name_ptr, name_len);
@@ -169,6 +171,7 @@ second_insert:
 	btrfs_set_dir_type(leaf, dir_item, type);
 	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	name_ptr = (unsigned long)(dir_item + 1);
 	write_extent_buffer(leaf, name, name_ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e7a938bfbc7..a4373db5967a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
 #include "async-thread.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "tree-log.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -694,6 +695,18 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 }
 
 
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
+				      buf->start + buf->len - 1, WB_SYNC_NONE);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+				  buf->start, buf->start + buf->len -1);
+}
+
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid)
 {
@@ -732,15 +745,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 }
 
-int wait_on_tree_block_writeback(struct btrfs_root *root,
-				 struct extent_buffer *buf)
-{
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree,
-					buf);
-	return 0;
-}
-
 static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			u32 stripesize, struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
@@ -771,6 +775,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
+	mutex_init(&root->log_mutex);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
 	root->ref_tree = &root->ref_tree_struct;
@@ -809,11 +814,74 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
-					       struct btrfs_key *location)
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct extent_buffer *eb;
+	int ret;
+
+	if (!fs_info->log_root_tree)
+		return 0;
+
+	eb = fs_info->log_root_tree->node;
+
+	WARN_ON(btrfs_header_level(eb) != 0);
+	WARN_ON(btrfs_header_nritems(eb) != 0);
+
+	ret = btrfs_free_extent(trans, fs_info->tree_root,
+				eb->start, eb->len,
+				BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1);
+	BUG_ON(ret);
+
+	free_extent_buffer(eb);
+	kfree(fs_info->log_root_tree);
+	fs_info->log_root_tree = NULL;
+	return 0;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+
+	root = kzalloc(sizeof(*root), GFP_NOFS);
+	if (!root)
+		return -ENOMEM;
+
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+	root->ref_cows = 0;
+
+	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
+					    BTRFS_TREE_LOG_OBJECTID,
+					    0, 0, 0, 0, 0);
+
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_bytenr(root->node, root->node->start);
+	btrfs_set_header_generation(root->node, trans->transid);
+	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
+
+	write_extent_buffer(root->node, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(root->node),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(root->node);
+	btrfs_tree_unlock(root->node);
+	fs_info->log_root_tree = root;
+	return 0;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+					       struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *l;
 	u64 highest_inode;
@@ -863,11 +931,13 @@ out:
 				     blocksize, 0);
 	BUG_ON(!root->node);
 insert:
-	root->ref_cows = 1;
-	ret = btrfs_find_highest_inode(root, &highest_inode);
-	if (ret == 0) {
-		root->highest_inode = highest_inode;
-		root->last_inode_alloc = highest_inode;
+	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+		root->ref_cows = 1;
+		ret = btrfs_find_highest_inode(root, &highest_inode);
+		if (ret == 0) {
+			root->highest_inode = highest_inode;
+			root->last_inode_alloc = highest_inode;
+		}
 	}
 	return root;
 }
@@ -907,7 +977,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	if (root)
 		return root;
 
-	root = btrfs_read_fs_root_no_radix(fs_info, location);
+	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
@@ -1250,16 +1320,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 blocksize;
 	u32 stripesize;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
-	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
 						GFP_NOFS);
-	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
 						GFP_NOFS);
-	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
 					      GFP_NOFS);
+	struct btrfs_root *log_tree_root;
+
 	int ret;
 	int err = -EINVAL;
 
@@ -1343,6 +1415,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
@@ -1352,6 +1425,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
+	init_waitqueue_head(&fs_info->tree_log_wait);
+	atomic_set(&fs_info->tree_log_commit, 0);
+	atomic_set(&fs_info->tree_log_writers, 0);
+	fs_info->tree_log_transid = 0;
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1532,7 +1609,26 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!fs_info->transaction_kthread)
 		goto fail_cleaner;
 
+	if (btrfs_super_log_root(disk_super) != 0) {
+		u32 blocksize;
+		u64 bytenr = btrfs_super_log_root(disk_super);
+
+		blocksize =
+		     btrfs_level_size(tree_root,
+				      btrfs_super_log_root_level(disk_super));
 
+		log_tree_root = kzalloc(sizeof(struct btrfs_root),
+						      GFP_NOFS);
+
+		__setup_root(nodesize, leafsize, sectorsize, stripesize,
+			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+		log_tree_root->node = read_tree_block(tree_root, bytenr,
+						      blocksize, 0);
+		ret = btrfs_recover_log_trees(log_tree_root);
+		BUG_ON(ret);
+	}
+	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
 fail_cleaner:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2562a273ae18..6b6fdc697f31 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -45,7 +45,7 @@ struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 					       struct btrfs_key *location);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location);
@@ -74,4 +74,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e63b3b4bed7c..646b9148ca21 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -496,6 +496,23 @@ static int match_extent_ref(struct extent_buffer *leaf,
 	return ret == 0;
 }
 
+/* simple helper to search for an existing extent at a given offset */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+			u64 start, u64 len)
+{
+	int ret;
+	struct btrfs_key key;
+
+	maybe_lock_mutex(root);
+	key.objectid = start;
+	key.offset = len;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+				0, 0);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 bytenr,
@@ -1409,7 +1426,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 
-static int update_pinned_extents(struct btrfs_root *root,
+int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
 	u64 len;
@@ -1492,7 +1509,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		update_pinned_extents(root, start, end + 1 - start, 0);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 		if (need_resched()) {
@@ -1538,14 +1555,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
 
-		eb = btrfs_find_tree_block(extent_root, ins.objectid,
+		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
 					   ins.offset);
 
-		if (!btrfs_buffer_uptodate(eb, trans->transid)) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+		if (!btrfs_buffer_uptodate(eb, trans->transid))
 			btrfs_read_buffer(eb, trans->transid);
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
 
 		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
@@ -1585,13 +1599,20 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
+			/* we can reuse a block if it hasn't been written
+			 * and it is from this transaction.  We can't
+			 * reuse anything from the tree log root because
+			 * it has tiny sub-transactions.
+			 */
 			if (btrfs_buffer_uptodate(buf, 0) &&
 			    btrfs_try_tree_lock(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
 					btrfs_header_generation(buf);
-				if (header_transid == transid &&
+				if (btrfs_header_owner(buf) !=
+				    BTRFS_TREE_LOG_OBJECTID &&
+				    header_transid == transid &&
 				    !btrfs_header_flag(buf,
 					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
@@ -1603,7 +1624,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
-		update_pinned_extents(root, bytenr, num_bytes, 1);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
@@ -1801,7 +1822,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 				  GFP_NOFS);
 		if (!test_range_bit(&extent_root->fs_info->extent_ins,
 				    start, end, EXTENT_LOCKED, 0)) {
-			update_pinned_extents(extent_root, start,
+			btrfs_update_pinned_extents(extent_root, start,
 					      end + 1 - start, 1);
 			ret = __free_extent(trans, extent_root,
 					     start, end + 1 - start,
@@ -1919,6 +1940,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->last_data_alloc;
 	}
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		last_ptr = &root->fs_info->last_log_alloc;
+		if (!last_ptr == 0 && root->fs_info->last_alloc) {
+			*last_ptr = root->fs_info->last_alloc + empty_cluster;
+		}
+	}
 
 	if (last_ptr) {
 		if (*last_ptr)
@@ -2268,6 +2295,35 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	maybe_unlock_mutex(root);
 	return ret;
 }
+
+/*
+ * this is used by the tree logging recovery code.  It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+
+	maybe_lock_mutex(root);
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	cache_block_group(root, block_group);
+
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -2350,9 +2406,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
-static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct extent_buffer *leaf)
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2402,9 +2457,9 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
-					 struct btrfs_leaf_ref *ref)
+static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_leaf_ref *ref)
 {
 	int i;
 	int ret;
@@ -2512,7 +2567,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref_no_cache(trans, root, cur);
+			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2552,7 +2607,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
-				ret = drop_leaf_ref(trans, root, ref);
+				ret = cache_drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
 				btrfs_free_leaf_ref(root, ref);
@@ -3628,6 +3683,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
+	root->fs_info->last_trans_new_blockgroup = trans->transid;
+
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e9e86fbaa243..84ecf3ab8511 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -36,6 +36,8 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
 #include "compat.h"
 
 
@@ -988,10 +990,27 @@ out_nolock:
 	*ppos = pos;
 
 	if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		err = sync_page_range(inode, inode->i_mapping,
-				      start_pos, num_written);
+		struct btrfs_trans_handle *trans;
+
+		err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
+					     start_pos + num_written -1,
+					     WB_SYNC_NONE);
+		if (err < 0)
+			num_written = err;
+
+		err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
+				 start_pos, start_pos + num_written - 1);
 		if (err < 0)
 			num_written = err;
+
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+		if (ret == 0) {
+			btrfs_sync_log(trans, root);
+			btrfs_end_transaction(trans, root);
+		} else {
+			btrfs_commit_transaction(trans, root);
+		}
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 		do_sync_file_range(file, start_pos,
@@ -1019,8 +1038,7 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static int btrfs_sync_file(struct file *file,
-			   struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1043,6 +1061,8 @@ static int btrfs_sync_file(struct file *file,
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	filemap_fdatawait(inode->i_mapping);
+
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
@@ -1054,7 +1074,16 @@ static int btrfs_sync_file(struct file *file,
 		ret = -ENOMEM;
 		goto out;
 	}
-	ret = btrfs_commit_transaction(trans, root);
+
+	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = btrfs_commit_transaction(trans, root);
+	} else {
+		btrfs_sync_log(trans, root);
+		ret = btrfs_end_transaction(trans, root);
+	}
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43d3f2649ca3..65df9d830230 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -46,6 +46,8 @@
 #include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
+#include "compat.h"
+#include "tree-log.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -586,6 +588,7 @@ nocow:
 			  &ordered_extent->list);
 
 	btrfs_ordered_update_i_size(inode, ordered_extent);
+	btrfs_update_inode(trans, root, inode);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
 	/* once for us */
@@ -593,7 +596,6 @@ nocow:
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
@@ -1007,7 +1009,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 
 	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
-	inode->i_generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
@@ -1056,7 +1059,8 @@ make_bad:
 	make_bad_inode(inode);
 }
 
-static void fill_inode_item(struct extent_buffer *leaf,
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
 			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
@@ -1082,7 +1086,8 @@ static void fill_inode_item(struct extent_buffer *leaf,
 				inode->i_ctime.tv_nsec);
 
 	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
-	btrfs_set_inode_generation(leaf, item, inode->i_generation);
+	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
 	btrfs_set_inode_block_group(leaf, item,
@@ -1112,7 +1117,7 @@ int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
 
-	fill_inode_item(leaf, inode_item, inode);
+	fill_inode_item(trans, leaf, inode_item, inode);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
@@ -1122,14 +1127,12 @@ failed:
 }
 
 
-static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *dir,
-			      struct dentry *dentry)
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len)
 {
 	struct btrfs_path *path;
-	const char *name = dentry->d_name.name;
-	int name_len = dentry->d_name.len;
 	int ret = 0;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
@@ -1160,13 +1163,12 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root, path);
 
 	ret = btrfs_del_inode_ref(trans, root, name, name_len,
-				  dentry->d_inode->i_ino,
-				  dentry->d_parent->d_inode->i_ino, &index);
+				  inode->i_ino,
+				  dir->i_ino, &index);
 	if (ret) {
 		printk("failed to delete reference to %.*s, "
 		       "inode %lu parent %lu\n", name_len, name,
-		       dentry->d_inode->i_ino,
-		       dentry->d_parent->d_inode->i_ino);
+		       inode->i_ino, dir->i_ino);
 		goto err;
 	}
 
@@ -1183,21 +1185,25 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	btrfs_release_path(root, path);
 
-	dentry->d_inode->i_ctime = dir->i_ctime;
+	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+					 inode, dir->i_ino);
+	BUG_ON(ret);
+
+	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+					   dir, index);
+	BUG_ON(ret);
 err:
 	btrfs_free_path(path);
-	if (!ret) {
-		btrfs_i_size_write(dir, dir->i_size - name_len * 2);
-		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-		btrfs_update_inode(trans, root, dir);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		dentry->d_inode->i_nlink--;
-#else
-		drop_nlink(dentry->d_inode);
-#endif
-		ret = btrfs_update_inode(trans, root, dentry->d_inode);
-		dir->i_sb->s_dirt = 1;
-	}
+	if (ret)
+		goto out;
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	btrfs_update_inode(trans, root, dir);
+	btrfs_drop_nlink(inode);
+	ret = btrfs_update_inode(trans, root, inode);
+	dir->i_sb->s_dirt = 1;
+out:
 	return ret;
 }
 
@@ -1218,7 +1224,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
-	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+				 dentry->d_name.name, dentry->d_name.len);
 
 	if (inode->i_nlink == 0)
 		ret = btrfs_orphan_add(trans, inode);
@@ -1256,7 +1263,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto fail_trans;
 
 	/* now the directory is empty */
-	err = btrfs_unlink_trans(trans, root, dir, dentry);
+	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+				 dentry->d_name.name, dentry->d_name.len);
 	if (!err) {
 		btrfs_i_size_write(inode, 0);
 	}
@@ -1283,10 +1291,10 @@ fail:
  * min_type is the minimum key type to truncate down to.  If set to 0, this
  * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
-static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct inode *inode,
-				   u32 min_type)
+noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct inode *inode,
+					u64 new_size, u32 min_type)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1307,7 +1315,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
-	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
+	if (root->ref_cows)
+		btrfs_drop_extent_cache(inode,
+					new_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1324,7 +1334,13 @@ search_again:
 		goto error;
 	}
 	if (ret > 0) {
-		BUG_ON(path->slots[0] == 0);
+		/* there are no items in the tree for us to truncate, we're
+		 * done
+		 */
+		if (path->slots[0] == 0) {
+			ret = 0;
+			goto error;
+		}
 		path->slots[0]--;
 	}
 
@@ -1358,10 +1374,10 @@ search_again:
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
 			ret = btrfs_csum_truncate(trans, root, path,
-						  inode->i_size);
+						  new_size);
 			BUG_ON(ret);
 		}
-		if (item_end < inode->i_size) {
+		if (item_end < new_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
@@ -1378,7 +1394,7 @@ search_again:
 			btrfs_set_key_type(&key, found_type);
 			goto next;
 		}
-		if (found_key.offset >= inode->i_size)
+		if (found_key.offset >= new_size)
 			del_item = 1;
 		else
 			del_item = 0;
@@ -1394,7 +1410,7 @@ search_again:
 			if (!del_item) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
-				extent_num_bytes = inode->i_size -
+				extent_num_bytes = new_size -
 					found_key.offset + root->sectorsize - 1;
 				extent_num_bytes = extent_num_bytes &
 					~((u64)root->sectorsize - 1);
@@ -1402,7 +1418,7 @@ search_again:
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
-				if (extent_start != 0)
+				if (root->ref_cows && extent_start != 0)
 					dec_i_blocks(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
@@ -1413,22 +1429,29 @@ search_again:
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					dec_i_blocks(inode, num_dec);
+					if (root->ref_cows)
+						dec_i_blocks(inode, num_dec);
+				}
+				if (root->ref_cows) {
+					root_gen =
+						btrfs_header_generation(leaf);
 				}
-				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			if (!del_item) {
-				u32 newsize = inode->i_size - found_key.offset;
-				dec_i_blocks(inode, item_end + 1 -
-					    found_key.offset - newsize);
-				newsize =
-				    btrfs_file_extent_calc_inline_size(newsize);
+				u32 size = new_size - found_key.offset;
+
+				if (root->ref_cows) {
+					dec_i_blocks(inode, item_end + 1 -
+						    found_key.offset - size);
+				}
+				size =
+				    btrfs_file_extent_calc_inline_size(size);
 				ret = btrfs_truncate_item(trans, root, path,
-							  newsize, 1);
+							  size, 1);
 				BUG_ON(ret);
-			} else {
+			} else if (root->ref_cows) {
 				dec_i_blocks(inode, item_end + 1 -
 					     found_key.offset);
 			}
@@ -1666,7 +1689,7 @@ void btrfs_delete_inode(struct inode *inode)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
-	ret = btrfs_truncate_in_trans(trans, root, inode, 0);
+	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
 	if (ret) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete_lock;
@@ -1753,15 +1776,20 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
-static int btrfs_init_locked_inode(struct inode *inode, void *p)
+static noinline void init_btrfs_i(struct inode *inode)
 {
-	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->ino;
-	BTRFS_I(inode)->root = args->root;
-	BTRFS_I(inode)->delalloc_bytes = 0;
-	inode->i_mapping->writeback_index = 0;
-	BTRFS_I(inode)->disk_i_size = 0;
-	BTRFS_I(inode)->index_cnt = (u64)-1;
+	struct btrfs_inode *bi = BTRFS_I(inode);
+
+	bi->i_acl = NULL;
+	bi->i_default_acl = NULL;
+
+	bi->generation = 0;
+	bi->last_trans = 0;
+	bi->logged_trans = 0;
+	bi->delalloc_bytes = 0;
+	bi->disk_i_size = 0;
+	bi->flags = 0;
+	bi->index_cnt = (u64)-1;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1771,6 +1799,15 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
+	mutex_init(&BTRFS_I(inode)->log_mutex);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct btrfs_iget_args *args = p;
+	inode->i_ino = args->ino;
+	init_btrfs_i(inode);
+	BTRFS_I(inode)->root = args->root;
 	return 0;
 }
 
@@ -2263,21 +2300,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
+	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
-
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-			     inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-			     inode->i_mapping, GFP_NOFS);
-	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	mutex_init(&BTRFS_I(inode)->extent_mutex);
-	BTRFS_I(inode)->delalloc_bytes = 0;
-	inode->i_mapping->writeback_index = 0;
-	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->generation = trans->transid;
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -2290,7 +2316,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		new_inode_group = group;
 	}
 	BTRFS_I(inode)->block_group = new_inode_group;
-	BTRFS_I(inode)->flags = 0;
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -2318,7 +2343,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
-	fill_inode_item(path->nodes[0], inode_item, inode);
+	fill_inode_item(trans, path->nodes[0], inode_item, inode);
 
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
@@ -2349,38 +2374,34 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 }
 
-static int btrfs_add_link(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode,
-			    int add_backref, u64 index)
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index)
 {
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
 
 	key.objectid = inode->i_ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	ret = btrfs_insert_dir_item(trans, root,
-				    dentry->d_name.name, dentry->d_name.len,
-				    dentry->d_parent->d_inode->i_ino,
+	ret = btrfs_insert_dir_item(trans, root, name, name_len,
+				    parent_inode->i_ino,
 				    &key, btrfs_inode_type(inode),
 				    index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
-					     dentry->d_name.name,
-					     dentry->d_name.len,
-					     inode->i_ino,
-					     parent_inode->i_ino,
-					     index);
+						     name, name_len,
+						     inode->i_ino,
+						     parent_inode->i_ino,
+						     index);
 		}
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
-				   dentry->d_name.len * 2);
+				   name_len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_update_inode(trans, root,
-					 dentry->d_parent->d_inode);
+		ret = btrfs_update_inode(trans, root, parent_inode);
 	}
 	return ret;
 }
@@ -2389,7 +2410,9 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
 			    int backref, u64 index)
 {
-	int err = btrfs_add_link(trans, dentry, inode, backref, index);
+	int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+				 inode, dentry->d_name.name,
+				 dentry->d_name.len, backref, index);
 	if (!err) {
 		d_instantiate(dentry, inode);
 		return 0;
@@ -2513,19 +2536,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-				     inode->i_mapping, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-				     inode->i_mapping, GFP_NOFS);
-		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-		mutex_init(&BTRFS_I(inode)->csum_mutex);
-		mutex_init(&BTRFS_I(inode)->extent_mutex);
-		BTRFS_I(inode)->delalloc_bytes = 0;
-		BTRFS_I(inode)->disk_i_size = 0;
-		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2556,11 +2567,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		return -ENOENT;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	inode->i_nlink++;
-#else
-	inc_nlink(inode);
-#endif
+	btrfs_inc_nlink(inode);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -2650,7 +2657,9 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
-	err = btrfs_add_link(trans, dentry, inode, 0, index);
+	err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+				 inode, dentry->d_name.name,
+				 dentry->d_name.len, 0, index);
 	if (err)
 		goto out_fail;
 
@@ -3221,7 +3230,7 @@ static void btrfs_truncate(struct inode *inode)
 	if (ret)
 		goto out;
 	/* FIXME, add redo link to tree so we don't leak on crash */
-	ret = btrfs_truncate_in_trans(trans, root, inode,
+	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
 				      BTRFS_EXTENT_DATA_KEY);
 	btrfs_update_inode(trans, root, inode);
 
@@ -3304,6 +3313,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->logged_trans = 0;
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
@@ -3463,31 +3473,39 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 
 	btrfs_set_trans_block_group(trans, new_dir);
 
-	old_dentry->d_inode->i_nlink++;
+	btrfs_inc_nlink(old_dentry->d_inode);
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
 
-	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
+	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
+				 old_dentry->d_name.name,
+				 old_dentry->d_name.len);
 	if (ret)
 		goto out_fail;
 
 	if (new_inode) {
 		new_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
+		ret = btrfs_unlink_inode(trans, root, new_dir,
+					 new_dentry->d_inode,
+					 new_dentry->d_name.name,
+					 new_dentry->d_name.len);
 		if (ret)
 			goto out_fail;
 		if (new_inode->i_nlink == 0) {
-			ret = btrfs_orphan_add(trans, new_inode);
+			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
 			if (ret)
 				goto out_fail;
 		}
+
 	}
 	ret = btrfs_set_inode_index(new_dir, old_inode, &index);
 	if (ret)
 		goto out_fail;
 
-	ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index);
+	ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
+			     old_inode, new_dentry->d_name.name,
+			     new_dentry->d_name.len, 1, index);
 	if (ret)
 		goto out_fail;
 
@@ -3577,19 +3595,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-				     inode->i_mapping, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-				     inode->i_mapping, GFP_NOFS);
-		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-		mutex_init(&BTRFS_I(inode)->csum_mutex);
-		mutex_init(&BTRFS_I(inode)->extent_mutex);
-		BTRFS_I(inode)->delalloc_bytes = 0;
-		BTRFS_I(inode)->disk_i_size = 0;
-		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -3691,6 +3697,7 @@ static struct file_operations btrfs_dir_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 	.release        = btrfs_release_file,
+	.fsync		= btrfs_sync_file,
 };
 
 static struct extent_io_ops btrfs_extent_io_ops = {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 36726696e58b..e3984f902e71 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -202,8 +202,9 @@ again:
 		memcpy(&found_key, &key, sizeof(key));
 		key.offset++;
 		btrfs_release_path(root, path);
-		dead_root = btrfs_read_fs_root_no_radix(root->fs_info,
-							&found_key);
+		dead_root =
+			btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						    &found_key);
 		if (IS_ERR(dead_root)) {
 			ret = PTR_ERR(dead_root);
 			goto err;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index eff3ad72991b..49c4f5b40ed6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "tree-log.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
@@ -57,6 +58,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 		root->fs_info->generation++;
 		root->fs_info->last_alloc = 0;
 		root->fs_info->last_data_alloc = 0;
+		root->fs_info->last_log_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
@@ -83,7 +85,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
-static noinline int record_root_in_trans(struct btrfs_root *root)
+noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
 	struct btrfs_dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
@@ -151,7 +153,7 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
-struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 					     int num_blocks, int wait)
 {
 	struct btrfs_trans_handle *h =
@@ -164,7 +166,7 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
-	record_root_in_trans(root);
+	btrfs_record_root_in_trans(root);
 	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
@@ -456,6 +458,8 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			BUG_ON(!root->ref_tree);
 			dirty = root->dirty_root;
 
+			btrfs_free_log(trans, root);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
@@ -600,7 +604,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
-			record_root_in_trans(root);
+			btrfs_record_root_in_trans(root);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
@@ -745,7 +749,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	int ret;
 
 	INIT_LIST_HEAD(&dirty_fs_roots);
-
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 		cur_trans = trans->transaction;
@@ -821,10 +824,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	WARN_ON(cur_trans != trans->transaction);
 
+	/* btrfs_commit_tree_roots is responsible for getting the
+	 * various roots consistent with each other.  Every pointer
+	 * in the tree of tree roots has to point to the most up to date
+	 * root for every subvolume and other tree.  So, we have to keep
+	 * the tree logging code from jumping in and changing any
+	 * of the trees.
+	 *
+	 * At this point in the commit, there can't be any tree-log
+	 * writers, but a little lower down we drop the trans mutex
+	 * and let new people in.  By holding the tree_log_mutex
+	 * from now until after the super is written, we avoid races
+	 * with the tree-log code.
+	 */
+	mutex_lock(&root->fs_info->tree_log_mutex);
+
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
 
+	/* add_dirty_roots gets rid of all the tree log roots, it is now
+	 * safe to free the root of tree log roots
+	 */
+	btrfs_free_log_root_tree(trans, root->fs_info);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
@@ -843,6 +866,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
+
+	if (!root->fs_info->log_root_recovering) {
+		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+	}
+
 	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
 	       sizeof(root->fs_info->super_copy));
 
@@ -857,6 +886,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
 
+	/*
+	 * the super is written, we can safely allow the tree-loggers
+	 * to go about their business
+	 */
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 598baa312417..cc63650d60d6 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -98,4 +98,5 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cc2650b06952..b3bb5bbad76e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -81,12 +81,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		memcpy(&key, &root->defrag_progress, sizeof(key));
 	}
 
-	path->lowest_level = 1;
 	path->keep_locks = 1;
 	if (cache_only)
 		min_trans = root->defrag_trans_start;
 
-	ret = btrfs_search_forward(root, &key, path, cache_only, min_trans);
+	ret = btrfs_search_forward(root, &key, NULL, path,
+				   cache_only, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d1ce8314b948
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2804 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * stages for the tree walking.  The first
+ * stage (0) is to only pin down the blocks we find
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree an 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction.  Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree.  Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times, once to pin down all the extents it is
+ * using in ram and once, once to create all the inodes logged in the tree
+ * and once to do all the other items.
+ */
+
+/*
+ * btrfs_add_log_tree adds a new per-subvolume log tree into the
+ * tree of log tree roots.  This must be called with a tree log transaction
+ * running (see start_log_trans).
+ */
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root)
+{
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	struct btrfs_root *new_root = root;
+	int ret;
+	u64 objectid = root->root_key.objectid;
+
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      BTRFS_TREE_LOG_OBJECTID,
+				      0, 0, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		return ret;
+	}
+
+	btrfs_set_header_nritems(leaf, 0);
+	btrfs_set_header_level(leaf, 0);
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+
+	write_extent_buffer(leaf, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_level(&root_item, 0);
+	btrfs_set_root_refs(&root_item, 0);
+	btrfs_set_root_used(&root_item, 0);
+
+	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+	root_item.drop_level = 0;
+
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	leaf = NULL;
+
+	btrfs_set_root_dirid(&root_item, 0);
+
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = objectid;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
+				&root_item);
+	if (ret)
+		goto fail;
+
+	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
+					       &key);
+	BUG_ON(!new_root);
+
+	WARN_ON(root->log_root);
+	root->log_root = new_root;
+
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
+	new_root->ref_cows = 0;
+	new_root->last_trans = trans->transid;
+fail:
+	return ret;
+}
+
+/*
+ * start a sub transaction and setup the log tree
+ * this increments the log tree writer count to make the people
+ * syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root)
+{
+	int ret;
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	if (!root->fs_info->log_root_tree) {
+		ret = btrfs_init_log_root_tree(trans, root->fs_info);
+		BUG_ON(ret);
+	}
+	if (!root->log_root) {
+		ret = btrfs_add_log_tree(trans, root);
+		BUG_ON(ret);
+	}
+	atomic_inc(&root->fs_info->tree_log_writers);
+	root->fs_info->tree_log_batch++;
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	return 0;
+}
+
+/*
+ * returns 0 if there was a log transaction running and we were able
+ * to join, or returns -ENOENT if there were not transactions
+ * in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	smp_mb();
+	if (!root->log_root)
+		return -ENOENT;
+
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	if (root->log_root) {
+		ret = 0;
+		atomic_inc(&root->fs_info->tree_log_writers);
+		root->fs_info->tree_log_batch++;
+	}
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree
+ * and wake up anyone waiting to do a sync
+ */
+static int end_log_trans(struct btrfs_root *root)
+{
+	atomic_dec(&root->fs_info->tree_log_writers);
+	smp_mb();
+	if (waitqueue_active(&root->fs_info->tree_log_wait))
+		wake_up(&root->fs_info->tree_log_wait);
+	return 0;
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree.  The stage field tells us which part
+ * of the log tree processing we are currently doing.  The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+	/* should we free the extent on disk when done?  This is used
+	 * at transaction commit time while freeing a log tree
+	 */
+	int free;
+
+	/* should we write out the extent buffer?  This is used
+	 * while flushing the log tree to disk during a sync
+	 */
+	int write;
+
+	/* should we wait for the extent buffer io to finish?  Also used
+	 * while flushing the log tree to disk for a sync
+	 */
+	int wait;
+
+	/* pin only walk, we record which extents on disk belong to the
+	 * log trees
+	 */
+	int pin;
+
+	/* what stage of the replay code we're currently in */
+	int stage;
+
+	/* the root we are currently replaying */
+	struct btrfs_root *replay_dest;
+
+	/* the trans handle for the current replay */
+	struct btrfs_trans_handle *trans;
+
+	/* the function that gets used to process blocks we find in the
+	 * tree.  Note the extent_buffer might not be up to date when it is
+	 * passed in, and it must be checked or read if you need the data
+	 * inside it
+	 */
+	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+			    struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func used to pin down extents, write them or wait on them
+ */
+static int process_one_buffer(struct btrfs_root *log,
+			      struct extent_buffer *eb,
+			      struct walk_control *wc, u64 gen)
+{
+	if (wc->pin) {
+		mutex_lock(&log->fs_info->alloc_mutex);
+		btrfs_update_pinned_extents(log->fs_info->extent_root,
+					    eb->start, eb->len, 1);
+		mutex_unlock(&log->fs_info->alloc_mutex);
+	}
+
+	if (btrfs_buffer_uptodate(eb, gen)) {
+		if (wc->write)
+			btrfs_write_tree_block(eb);
+		if (wc->wait)
+			btrfs_wait_tree_block_writeback(eb);
+	}
+	return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten.  If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *eb, int slot,
+				   struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size;
+	u64 saved_i_size = 0;
+	int save_old_i_size = 0;
+	unsigned long src_ptr;
+	unsigned long dst_ptr;
+	int overwrite_root = 0;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		overwrite_root = 1;
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+	/* look for the key in the destination tree */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *src_copy;
+		char *dst_copy;
+		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+						  path->slots[0]);
+		if (dst_size != item_size)
+			goto insert;
+
+		if (item_size == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+		dst_copy = kmalloc(item_size, GFP_NOFS);
+		src_copy = kmalloc(item_size, GFP_NOFS);
+
+		read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+				   item_size);
+		ret = memcmp(dst_copy, src_copy, item_size);
+
+		kfree(dst_copy);
+		kfree(src_copy);
+		/*
+		 * they have the same contents, just return, this saves
+		 * us from cowing blocks in the destination tree and doing
+		 * extra writes that may not have been done by a previous
+		 * sync
+		 */
+		if (ret == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+
+	}
+insert:
+	btrfs_release_path(root, path);
+	/* try to insert the key into the destination tree */
+	ret = btrfs_insert_empty_item(trans, root, path,
+				      key, item_size);
+
+	/* make sure any existing item is the correct size */
+	if (ret == -EEXIST) {
+		u32 found_size;
+		found_size = btrfs_item_size_nr(path->nodes[0],
+						path->slots[0]);
+		if (found_size > item_size) {
+			btrfs_truncate_item(trans, root, path, item_size, 1);
+		} else if (found_size < item_size) {
+			ret = btrfs_del_item(trans, root,
+					     path);
+			BUG_ON(ret);
+
+			btrfs_release_path(root, path);
+			ret = btrfs_insert_empty_item(trans,
+				  root, path, key, item_size);
+			BUG_ON(ret);
+		}
+	} else if (ret) {
+		BUG();
+	}
+	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+					path->slots[0]);
+
+	/* don't overwrite an existing inode if the generation number
+	 * was logged as zero.  This is done when the tree logging code
+	 * is just logging an inode to make sure it exists after recovery.
+	 *
+	 * Also, don't overwrite i_size on directories during replay.
+	 * log replay inserts and removes directory items based on the
+	 * state of the tree found in the subvolume, and i_size is modified
+	 * as it goes
+	 */
+	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+		struct btrfs_inode_item *src_item;
+		struct btrfs_inode_item *dst_item;
+
+		src_item = (struct btrfs_inode_item *)src_ptr;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+		if (btrfs_inode_generation(eb, src_item) == 0)
+			goto no_copy;
+
+		if (overwrite_root &&
+		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+			save_old_i_size = 1;
+			saved_i_size = btrfs_inode_size(path->nodes[0],
+							dst_item);
+		}
+	}
+
+	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+			   src_ptr, item_size);
+
+	if (save_old_i_size) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+	}
+
+	/* make sure the generation is filled in */
+	if (key->type == BTRFS_INODE_ITEM_KEY) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+			btrfs_set_inode_generation(path->nodes[0], dst_item,
+						   trans->transid);
+		}
+	}
+no_copy:
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+					     u64 objectid)
+{
+	struct inode *inode;
+	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+	if (inode->i_state & I_NEW) {
+		BTRFS_I(inode)->root = root;
+		BTRFS_I(inode)->location.objectid = objectid;
+		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+		BTRFS_I(inode)->location.offset = 0;
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/* replays a single extent in 'eb' at 'slot' with 'key' into the
+ * subvolume 'root'.  path is released on entry and should be released
+ * on exit.
+ *
+ * extents in the log tree have not been allocated out of the extent
+ * tree yet.  So, this completes the allocation, taking a reference
+ * as required if the extent already exists or creating a new extent
+ * if it isn't in the extent allocation tree yet.
+ *
+ * The extent is inserted into the file, dropping any existing extents
+ * from the file that overlap the new one.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int found_type;
+	u64 mask = root->sectorsize - 1;
+	u64 extent_end;
+	u64 alloc_hint;
+	u64 start = key->offset;
+	struct btrfs_file_extent_item *item;
+	struct inode *inode = NULL;
+	unsigned long size;
+	int ret = 0;
+
+	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(eb, item);
+
+	if (found_type == BTRFS_FILE_EXTENT_REG)
+		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		size = btrfs_file_extent_inline_len(eb,
+						    btrfs_item_nr(eb, slot));
+		extent_end = (start + size + mask) & ~mask;
+	} else {
+		ret = 0;
+		goto out;
+	}
+
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		ret = -EIO;
+		goto out;
+	}
+
+	/*
+	 * first check to see if we already have this extent in the
+	 * file.  This must be done before the btrfs_drop_extents run
+	 * so we don't try to drop this extent.
+	 */
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       start, 0);
+
+	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
+		struct btrfs_file_extent_item cmp1;
+		struct btrfs_file_extent_item cmp2;
+		struct btrfs_file_extent_item *existing;
+		struct extent_buffer *leaf;
+
+		leaf = path->nodes[0];
+		existing = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_file_extent_item);
+
+		read_extent_buffer(eb, &cmp1, (unsigned long)item,
+				   sizeof(cmp1));
+		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
+				   sizeof(cmp2));
+
+		/*
+		 * we already have a pointer to this exact extent,
+		 * we don't have to do anything
+		 */
+		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+			btrfs_release_path(root, path);
+			goto out;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* drop any overlapping extents */
+	ret = btrfs_drop_extents(trans, root, inode,
+			 start, extent_end, start, &alloc_hint);
+	BUG_ON(ret);
+
+	BUG_ON(ret);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		struct btrfs_key ins;
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		/* insert the extent pointer in the file */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+
+		/*
+		 * is this extent already allocated in the extent
+		 * allocation tree?  If so, just add a reference
+		 */
+		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
+		btrfs_release_path(root, path);
+		if (ret == 0) {
+			ret = btrfs_inc_extent_ref(trans, root,
+				   ins.objectid, ins.offset,
+				   root->root_key.objectid,
+				   trans->transid, key->objectid, start);
+		} else {
+			/*
+			 * insert the extent pointer in the extent
+			 * allocation tree
+			 */
+			ret = btrfs_alloc_logged_extent(trans, root,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						start, &ins);
+			BUG_ON(ret);
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		/* inline extents are easy, we just overwrite them */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+	}
+	/* btrfs_drop_extents changes i_blocks, update it here */
+	inode->i_blocks += (extent_end - start) >> 9;
+	btrfs_update_inode(trans, root, inode);
+out:
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct inode *dir,
+				      struct btrfs_dir_item *di)
+{
+	struct inode *inode;
+	char *name;
+	int name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_key location;
+	int ret;
+
+	leaf = path->nodes[0];
+
+	btrfs_dir_item_key_to_cpu(leaf, di, &location);
+	name_len = btrfs_dir_name_len(leaf, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+	btrfs_release_path(root, path);
+
+	inode = read_one_inode(root, location.objectid);
+	BUG_ON(!inode);
+
+	btrfs_inc_nlink(inode);
+	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	kfree(name);
+
+	iput(inode);
+	return ret;
+}
+
+/*
+ * helper function to see if a given name and sequence number found
+ * in an inode back reference are already in a directory and correctly
+ * point to this inode
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 dirid, u64 objectid, u64 index,
+				 const char *name, int name_len)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_key location;
+	int match = 0;
+
+	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+					 index, name, name_len, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+		if (location.objectid != objectid)
+			goto out;
+	} else
+		goto out;
+	btrfs_release_path(root, path);
+
+	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+		if (location.objectid != objectid)
+			goto out;
+	} else
+		goto out;
+	match = 1;
+out:
+	btrfs_release_path(root, path);
+	return match;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode.  This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+				   struct btrfs_key *key,
+				   char *name, int namelen)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	unsigned long name_ptr;
+	int found_name_len;
+	int item_size;
+	int ret;
+	int match = 0;
+
+	path = btrfs_alloc_path();
+	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+	if (ret != 0)
+		goto out;
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+	ptr_end = ptr + item_size;
+	while (ptr < ptr_end) {
+		ref = (struct btrfs_inode_ref *)ptr;
+		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+		if (found_name_len == namelen) {
+			name_ptr = (unsigned long)(ref + 1);
+			ret = memcmp_extent_buffer(path->nodes[0], name,
+						   name_ptr, namelen);
+			if (ret == 0) {
+				match = 1;
+				goto out;
+			}
+		}
+		ptr = (unsigned long)(ref + 1) + found_name_len;
+	}
+out:
+	btrfs_free_path(path);
+	return match;
+}
+
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct inode *dir;
+	int ret;
+	struct btrfs_key location;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
+	struct inode *inode;
+	char *name;
+	int namelen;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+
+	location.objectid = key->objectid;
+	location.type = BTRFS_INODE_ITEM_KEY;
+	location.offset = 0;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, key->offset);
+	if (!dir)
+		return -ENOENT;
+
+	inode = read_one_inode(root, key->objectid);
+	BUG_ON(!dir);
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+again:
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	namelen = btrfs_inode_ref_name_len(eb, ref);
+	name = kmalloc(namelen, GFP_NOFS);
+	BUG_ON(!name);
+
+	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+	/* if we already have a perfect match, we're done */
+	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+			 btrfs_inode_ref_index(eb, ref),
+			 name, namelen)) {
+		goto out;
+	}
+
+	/*
+	 * look for a conflicting back reference in the metadata.
+	 * if we find one we have to unlink that name of the file
+	 * before we add our new link.  Later on, we overwrite any
+	 * existing back reference, and we don't want to create
+	 * dangling pointers in the directory.
+	 */
+conflict_again:
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *victim_name;
+		int victim_name_len;
+		struct btrfs_inode_ref *victim_ref;
+		unsigned long ptr;
+		unsigned long ptr_end;
+		struct extent_buffer *leaf = path->nodes[0];
+
+		/* are we trying to overwrite a back ref for the root directory
+		 * if so, just jump out, we're done
+		 */
+		if (key->objectid == key->offset)
+			goto out_nowrite;
+
+		/* check all the names in this back reference to see
+		 * if they are in the log.  if so, we allow them to stay
+		 * otherwise they must be unlinked as a conflict
+		 */
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		while(ptr < ptr_end) {
+			victim_ref = (struct btrfs_inode_ref *)ptr;
+			victim_name_len = btrfs_inode_ref_name_len(leaf,
+								   victim_ref);
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			BUG_ON(!victim_name);
+
+			read_extent_buffer(leaf, victim_name,
+					   (unsigned long)(victim_ref + 1),
+					   victim_name_len);
+
+			if (!backref_in_log(log, key, victim_name,
+					    victim_name_len)) {
+				btrfs_inc_nlink(inode);
+				btrfs_release_path(root, path);
+				ret = btrfs_unlink_inode(trans, root, dir,
+							 inode, victim_name,
+							 victim_name_len);
+				kfree(victim_name);
+				btrfs_release_path(root, path);
+				goto conflict_again;
+			}
+			kfree(victim_name);
+			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+		}
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+
+	/* look for a conflicting name */
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* insert our name */
+	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+			     btrfs_inode_ref_index(eb, ref));
+	BUG_ON(ret);
+
+	btrfs_update_inode(trans, root, inode);
+
+out:
+	ref_ptr = (unsigned long)(ref + 1) + namelen;
+	kfree(name);
+	if (ref_ptr < ref_end)
+		goto again;
+
+	/* finally write the back reference in the inode */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
+	BUG_ON(ret);
+
+out_nowrite:
+	btrfs_release_path(root, path);
+	iput(dir);
+	iput(inode);
+	return 0;
+}
+
+/*
+ * replay one csum item from the log tree into the subvolume 'root'
+ * eb, slot and key all refer to the log tree
+ * path is for temp use by this function and should be released on return
+ *
+ * This copies the checksums out of the log tree and inserts them into
+ * the subvolume.  Any existing checksums for this range in the file
+ * are overwritten, and new items are added where required.
+ *
+ * We keep this simple by reusing the btrfs_ordered_sum code from
+ * the data=ordered mode.  This basically means making a copy
+ * of all the checksums in ram, which we have to do anyway for kmap
+ * rules.
+ *
+ * The copy is then sent down to btrfs_csum_file_blocks, which
+ * does all the hard work of finding existing items in the file
+ * or adding new ones.
+ */
+static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 cur_offset;
+	unsigned long file_bytes;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct inode *inode;
+	unsigned long ptr;
+
+	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		return -EIO;
+	}
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
+	if (!sums) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&sums->list);
+	sums->len = file_bytes;
+	sums->file_offset = key->offset;
+
+	/*
+	 * copy all the sums into the ordered sum struct
+	 */
+	sector_sum = sums->sums;
+	cur_offset = key->offset;
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	while(item_size > 0) {
+		sector_sum->offset = cur_offset;
+		read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
+		sector_sum++;
+		item_size -= BTRFS_CRC32_SIZE;
+		ptr += BTRFS_CRC32_SIZE;
+		cur_offset += root->sectorsize;
+	}
+
+	/* let btrfs_csum_file_blocks add them into the file */
+	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	BUG_ON(ret);
+	kfree(sums);
+	iput(inode);
+
+	return 0;
+}
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	u64 nlink = 0;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	int name_len;
+
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key,
+				      path->slots[0]);
+		if (key.objectid != inode->i_ino ||
+		    key.type != BTRFS_INODE_REF_KEY)
+			break;
+		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+						   path->slots[0]);
+		while(ptr < ptr_end) {
+			struct btrfs_inode_ref *ref;
+
+			ref = (struct btrfs_inode_ref *)ptr;
+			name_len = btrfs_inode_ref_name_len(path->nodes[0],
+							    ref);
+			ptr = (unsigned long)(ref + 1) + name_len;
+			nlink++;
+		}
+
+		if (key.offset == 0)
+			break;
+		key.offset--;
+		btrfs_release_path(root, path);
+	}
+	btrfs_free_path(path);
+	if (nlink != inode->i_nlink) {
+		inode->i_nlink = nlink;
+		btrfs_update_inode(trans, root, inode);
+	}
+
+	return 0;
+}
+
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    struct btrfs_path *path)
+{
+	int ret;
+	struct btrfs_key key;
+	struct inode *inode;
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = (u64)-1;
+	while(1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			break;
+
+		if (ret == 1) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
+		    key.type != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		ret = btrfs_del_item(trans, root, path);
+		BUG_ON(ret);
+
+		btrfs_release_path(root, path);
+		inode = read_one_inode(root, key.offset);
+		BUG_ON(!inode);
+
+		ret = fixup_inode_link_count(trans, root, inode);
+		BUG_ON(ret);
+
+		iput(inode);
+
+		if (key.offset == 0)
+			break;
+		key.offset--;
+	}
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+
+/*
+ * record a given inode in the fixup dir so we can check its link
+ * count when replay is done.  The link count is incremented here
+ * so the inode won't go away until we check it
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid)
+{
+	struct btrfs_key key;
+	int ret = 0;
+	struct inode *inode;
+
+	inode = read_one_inode(root, objectid);
+	BUG_ON(!inode);
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = objectid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+	btrfs_release_path(root, path);
+	if (ret == 0) {
+		btrfs_inc_nlink(inode);
+		btrfs_update_inode(trans, root, inode);
+	} else if (ret == -EEXIST) {
+		ret = 0;
+	} else {
+		BUG();
+	}
+	iput(inode);
+
+	return ret;
+}
+
+/*
+ * when replaying the log for a directory, we only insert names
+ * for inodes that actually exist.  This means an fsync on a directory
+ * does not implicitly fsync all the new files in it
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    u64 dirid, u64 index,
+				    char *name, int name_len, u8 type,
+				    struct btrfs_key *location)
+{
+	struct inode *inode;
+	struct inode *dir;
+	int ret;
+
+	inode = read_one_inode(root, location->objectid);
+	if (!inode)
+		return -ENOENT;
+
+	dir = read_one_inode(root, dirid);
+	if (!dir) {
+		iput(inode);
+		return -EIO;
+	}
+	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+	/* FIXME, put inode into FIXUP list */
+
+	iput(inode);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped.  fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *eb,
+				    struct btrfs_dir_item *di,
+				    struct btrfs_key *key)
+{
+	char *name;
+	int name_len;
+	struct btrfs_dir_item *dst_di;
+	struct btrfs_key found_key;
+	struct btrfs_key log_key;
+	struct inode *dir;
+	struct inode *inode;
+	u8 log_type;
+	int ret;
+
+	dir = read_one_inode(root, key->objectid);
+	BUG_ON(!dir);
+
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	log_type = btrfs_dir_type(eb, di);
+	read_extent_buffer(eb, name, (unsigned long)(di + 1),
+		   name_len);
+
+	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	if (key->type == BTRFS_DIR_ITEM_KEY) {
+		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+				       name, name_len, 1);
+	}
+	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						     key->objectid,
+						     key->offset, name,
+						     name_len, 1);
+	} else {
+		BUG();
+	}
+	if (!dst_di || IS_ERR(dst_di)) {
+		/* we need a sequence number to insert, so we only
+		 * do inserts for the BTRFS_DIR_INDEX_KEY types
+		 */
+		if (key->type != BTRFS_DIR_INDEX_KEY)
+			goto out;
+		goto insert;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* the existing item matches the logged item */
+	if (found_key.objectid == log_key.objectid &&
+	    found_key.type == log_key.type &&
+	    found_key.offset == log_key.offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		goto out;
+	}
+
+	/*
+	 * don't drop the conflicting directory entry if the inode
+	 * for the new entry doesn't exist
+	 */
+	inode = read_one_inode(root, log_key.objectid);
+	if (!inode)
+		goto out;
+
+	iput(inode);
+	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+	BUG_ON(ret);
+
+	if (key->type == BTRFS_DIR_INDEX_KEY)
+		goto insert;
+out:
+	btrfs_release_path(root, path);
+	kfree(name);
+	iput(dir);
+	return 0;
+
+insert:
+	btrfs_release_path(root, path);
+	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+			      name, name_len, log_type, &log_key);
+
+	if (ret && ret != -ENOENT)
+		BUG();
+	goto out;
+}
+
+/*
+ * find all the names in a directory item and reconcile them into
+ * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
+ * one name in a directory item, but the same code gets used for
+ * both directory index types
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct extent_buffer *eb, int slot,
+					struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	struct btrfs_dir_item *di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		ret = replay_one_name(trans, root, path, eb, di, key);
+		BUG_ON(ret);
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	return 0;
+}
+
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   u64 dirid, int key_type,
+				   u64 *start_ret, u64 *end_ret)
+{
+	struct btrfs_key key;
+	u64 found_end;
+	struct btrfs_dir_log_item *item;
+	int ret;
+	int nritems;
+
+	if (*start_ret == (u64)-1)
+		return 1;
+
+	key.objectid = dirid;
+	key.type = key_type;
+	key.offset = *start_ret;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+	if (ret != 0)
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto next;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+	if (*start_ret >= key.offset && *start_ret <= found_end) {
+		ret = 0;
+		*start_ret = key.offset;
+		*end_ret = found_end;
+		goto out;
+	}
+	ret = 1;
+next:
+	/* check the next slot in the tree to see if it is a valid item */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (path->slots[0] >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	} else {
+		path->slots[0]++;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto out;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+	*start_ret = key.offset;
+	*end_ret = found_end;
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * this looks for a given directory item in the log.  If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_root *log,
+				      struct btrfs_path *path,
+				      struct btrfs_path *log_path,
+				      struct inode *dir,
+				      struct btrfs_key *dir_key)
+{
+	int ret;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *log_di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	char *name;
+	struct inode *inode;
+	struct btrfs_key location;
+
+again:
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		name = kmalloc(name_len, GFP_NOFS);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		read_extent_buffer(eb, name, (unsigned long)(di + 1),
+				  name_len);
+		log_di = NULL;
+		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+			log_di = btrfs_lookup_dir_item(trans, log, log_path,
+						       dir_key->objectid,
+						       name, name_len, 0);
+		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+			log_di = btrfs_lookup_dir_index_item(trans, log,
+						     log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		}
+		if (!log_di || IS_ERR(log_di)) {
+			btrfs_dir_item_key_to_cpu(eb, di, &location);
+			btrfs_release_path(root, path);
+			btrfs_release_path(log, log_path);
+			inode = read_one_inode(root, location.objectid);
+			BUG_ON(!inode);
+
+			ret = link_to_fixup_dir(trans, root,
+						path, location.objectid);
+			BUG_ON(ret);
+			btrfs_inc_nlink(inode);
+			ret = btrfs_unlink_inode(trans, root, dir, inode,
+						 name, name_len);
+			BUG_ON(ret);
+			kfree(name);
+			iput(inode);
+
+			/* there might still be more names under this key
+			 * check and repeat if required
+			 */
+			ret = btrfs_search_slot(NULL, root, dir_key, path,
+						0, 0);
+			if (ret == 0)
+				goto again;
+			ret = 0;
+			goto out;
+		}
+		btrfs_release_path(log, log_path);
+		kfree(name);
+
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, log_path);
+	return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid)
+{
+	u64 range_start;
+	u64 range_end;
+	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+	int ret = 0;
+	struct btrfs_key dir_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *log_path;
+	struct inode *dir;
+
+	dir_key.objectid = dirid;
+	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	dir = read_one_inode(root, dirid);
+	/* it isn't an error if the inode isn't there, that can happen
+	 * because we replay the deletes before we copy in the inode item
+	 * from the log
+	 */
+	if (!dir) {
+		btrfs_free_path(log_path);
+		return 0;
+	}
+again:
+	range_start = 0;
+	range_end = 0;
+	while(1) {
+		ret = find_dir_range(log, path, dirid, key_type,
+				     &range_start, &range_end);
+		if (ret != 0)
+			break;
+
+		dir_key.offset = range_start;
+		while(1) {
+			int nritems;
+			ret = btrfs_search_slot(NULL, root, &dir_key, path,
+						0, 0);
+			if (ret < 0)
+				goto out;
+
+			nritems = btrfs_header_nritems(path->nodes[0]);
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+			if (found_key.objectid != dirid ||
+			    found_key.type != dir_key.type)
+				goto next_type;
+
+			if (found_key.offset > range_end)
+				break;
+
+			ret = check_item_in_log(trans, root, log, path,
+						log_path, dir, &found_key);
+			BUG_ON(ret);
+			if (found_key.offset == (u64)-1)
+				break;
+			dir_key.offset = found_key.offset + 1;
+		}
+		btrfs_release_path(root, path);
+		if (range_end == (u64)-1)
+			break;
+		range_start = range_end + 1;
+	}
+
+next_type:
+	ret = 0;
+	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+		key_type = BTRFS_DIR_LOG_INDEX_KEY;
+		dir_key.type = BTRFS_DIR_INDEX_KEY;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(log_path);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+			     struct walk_control *wc, u64 gen)
+{
+	int nritems;
+	struct btrfs_path *path;
+	struct btrfs_root *root = wc->replay_dest;
+	struct btrfs_key key;
+	u32 item_size;
+	int level;
+	int i;
+	int ret;
+
+	btrfs_read_buffer(eb, gen);
+
+	level = btrfs_header_level(eb);
+
+	if (level != 0)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		item_size = btrfs_item_size_nr(eb, i);
+
+		/* inode keys are done during the first stage */
+		if (key.type == BTRFS_INODE_ITEM_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_INODES) {
+			struct inode *inode;
+			struct btrfs_inode_item *inode_item;
+			u32 mode;
+
+			inode_item = btrfs_item_ptr(eb, i,
+					    struct btrfs_inode_item);
+			mode = btrfs_inode_mode(eb, inode_item);
+			if (S_ISDIR(mode)) {
+				ret = replay_dir_deletes(wc->trans,
+					 root, log, path, key.objectid);
+				BUG_ON(ret);
+			}
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+
+			/* for regular files, truncate away
+			 * extents past the new EOF
+			 */
+			if (S_ISREG(mode)) {
+				inode = read_one_inode(root,
+						       key.objectid);
+				BUG_ON(!inode);
+
+				ret = btrfs_truncate_inode_items(wc->trans,
+					root, inode, inode->i_size,
+					BTRFS_EXTENT_DATA_KEY);
+				BUG_ON(ret);
+				iput(inode);
+			}
+			ret = link_to_fixup_dir(wc->trans, root,
+						path, key.objectid);
+			BUG_ON(ret);
+		}
+		if (wc->stage < LOG_WALK_REPLAY_ALL)
+			continue;
+
+		/* these keys are simply copied */
+		if (key.type == BTRFS_XATTR_ITEM_KEY) {
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_INODE_REF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			ret = replay_one_extent(wc->trans, root, path,
+						eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+			ret = replay_one_csum(wc->trans, root, path,
+					      eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
+			   key.type == BTRFS_DIR_INDEX_KEY) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			BUG_ON(ret);
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level,
+				   struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	u64 bytenr;
+	u64 ptr_gen;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	struct extent_buffer *parent;
+	u32 blocksize;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while(*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		if (btrfs_header_level(cur) != *level)
+			WARN_ON(1);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(cur))
+			break;
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+		blocksize = btrfs_level_size(root, *level - 1);
+
+		parent = path->nodes[*level];
+		root_owner = btrfs_header_owner(parent);
+		root_gen = btrfs_header_generation(parent);
+
+		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+
+		wc->process_func(root, next, wc, ptr_gen);
+
+		if (*level == 1) {
+			path->slots[*level]++;
+			if (wc->free) {
+				btrfs_read_buffer(next, ptr_gen);
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				ret = btrfs_drop_leaf_ref(trans, root, next);
+				BUG_ON(ret);
+
+				WARN_ON(root_owner !=
+					BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root, bytenr,
+							blocksize, root_owner,
+							root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(next);
+			continue;
+		}
+		btrfs_read_buffer(next, ptr_gen);
+
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			free_extent_buffer(path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(next);
+		path->slots[*level] = 0;
+		cond_resched();
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	if (path->nodes[*level] == root->node) {
+		parent = path->nodes[*level];
+	} else {
+		parent = path->nodes[*level + 1];
+	}
+	bytenr = path->nodes[*level]->start;
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
+	root_gen = btrfs_header_generation(parent);
+
+	wc->process_func(root, path->nodes[*level], wc,
+			 btrfs_header_generation(path->nodes[*level]));
+
+	if (wc->free) {
+		next = path->nodes[*level];
+		btrfs_tree_lock(next);
+		clean_tree_block(trans, root, next);
+		btrfs_wait_tree_block_writeback(next);
+		btrfs_tree_unlock(next);
+
+		if (*level == 0) {
+			ret = btrfs_drop_leaf_ref(trans, root, next);
+			BUG_ON(ret);
+		}
+		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					  root_owner, root_gen, 0, 0, 1);
+		BUG_ON(ret);
+	}
+	free_extent_buffer(path->nodes[*level]);
+	path->nodes[*level] = NULL;
+	*level += 1;
+
+	cond_resched();
+	return 0;
+}
+
+static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level,
+				 struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	int i;
+	int slot;
+	int ret;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+			struct extent_buffer *node;
+			node = path->nodes[i];
+			path->slots[i]++;
+			*level = i;
+			WARN_ON(*level == 0);
+			return 0;
+		} else {
+			if (path->nodes[*level] == root->node) {
+				root_owner = root->root_key.objectid;
+				root_gen =
+				   btrfs_header_generation(path->nodes[*level]);
+			} else {
+				struct extent_buffer *node;
+				node = path->nodes[*level + 1];
+				root_owner = btrfs_header_owner(node);
+				root_gen = btrfs_header_generation(node);
+			}
+			wc->process_func(root, path->nodes[*level], wc,
+				 btrfs_header_generation(path->nodes[*level]));
+			if (wc->free) {
+				struct extent_buffer *next;
+
+				next = path->nodes[*level];
+
+				btrfs_tree_lock(next);
+				clean_tree_block(trans, root, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+
+				if (*level == 0) {
+					ret = btrfs_drop_leaf_ref(trans, root,
+								  next);
+					BUG_ON(ret);
+				}
+
+				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+				ret = btrfs_free_extent(trans, root,
+						path->nodes[*level]->start,
+						path->nodes[*level]->len,
+						root_owner, root_gen, 0, 0, 1);
+				BUG_ON(ret);
+			}
+			free_extent_buffer(path->nodes[*level]);
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+/*
+ * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *log, struct walk_control *wc)
+{
+	int ret = 0;
+	int wret;
+	int level;
+	struct btrfs_path *path;
+	int i;
+	int orig_level;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	level = btrfs_header_level(log->node);
+	orig_level = level;
+	path->nodes[level] = log->node;
+	extent_buffer_get(log->node);
+	path->slots[level] = 0;
+
+	while(1) {
+		wret = walk_down_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = walk_up_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+	}
+
+	/* was the root node processed? if not, catch it here */
+	if (path->nodes[orig_level]) {
+		wc->process_func(log, path->nodes[orig_level], wc,
+			 btrfs_header_generation(path->nodes[orig_level]));
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[orig_level];
+
+			btrfs_tree_lock(next);
+			clean_tree_block(trans, log, next);
+			btrfs_wait_tree_block_writeback(next);
+			btrfs_tree_unlock(next);
+
+			if (orig_level == 0) {
+				ret = btrfs_drop_leaf_ref(trans, log,
+							  next);
+				BUG_ON(ret);
+			}
+			WARN_ON(log->root_key.objectid !=
+				BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_extent(trans, log,
+						next->start, next->len,
+						log->root_key.objectid,
+						btrfs_header_generation(next),
+						0, 0, 1);
+			BUG_ON(ret);
+		}
+	}
+
+	for (i = 0; i <= orig_level; i++) {
+		if (path->nodes[i]) {
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+	}
+	btrfs_free_path(path);
+	if (wc->free)
+		free_extent_buffer(log->node);
+	return ret;
+}
+
+int wait_log_commit(struct btrfs_root *log)
+{
+	DEFINE_WAIT(wait);
+	u64 transid = log->fs_info->tree_log_transid;
+
+	do {
+		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+		if (atomic_read(&log->fs_info->tree_log_commit))
+			schedule();
+		finish_wait(&log->fs_info->tree_log_wait, &wait);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+	} while(transid == log->fs_info->tree_log_transid &&
+		atomic_read(&log->fs_info->tree_log_commit));
+	return 0;
+}
+
+/*
+ * btrfs_sync_log does sends a given tree log down to the disk and
+ * updates the super blocks to record it.  When this call is done,
+ * you know that any inodes previously logged are safely on disk
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root)
+{
+	int ret;
+	unsigned long batch;
+	struct btrfs_root *log = root->log_root;
+	struct walk_control wc = {
+		.write = 1,
+		.process_func = process_one_buffer
+	};
+
+	mutex_lock(&log->fs_info->tree_log_mutex);
+	if (atomic_read(&log->fs_info->tree_log_commit)) {
+		wait_log_commit(log);
+		goto out;
+	}
+	atomic_set(&log->fs_info->tree_log_commit, 1);
+
+	while(1) {
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+		schedule_timeout_uninterruptible(1);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+		batch = log->fs_info->tree_log_batch;
+
+		while(atomic_read(&log->fs_info->tree_log_writers)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			batch = log->fs_info->tree_log_batch;
+			mutex_unlock(&log->fs_info->tree_log_mutex);
+			if (atomic_read(&log->fs_info->tree_log_writers))
+				schedule();
+			mutex_lock(&log->fs_info->tree_log_mutex);
+			finish_wait(&log->fs_info->tree_log_wait, &wait);
+		}
+		if (batch == log->fs_info->tree_log_batch)
+			break;
+	}
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	wc.wait = 1;
+
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+				 log->fs_info->log_root_tree->node->start);
+	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+		       btrfs_header_level(log->fs_info->log_root_tree->node));
+
+	write_ctree_super(trans, log->fs_info->tree_root);
+	log->fs_info->tree_log_transid++;
+	log->fs_info->tree_log_batch = 0;
+	atomic_set(&log->fs_info->tree_log_commit, 0);
+	smp_mb();
+	if (waitqueue_active(&log->fs_info->tree_log_wait))
+		wake_up(&log->fs_info->tree_log_wait);
+out:
+	mutex_unlock(&log->fs_info->tree_log_mutex);
+	return 0;
+
+}
+
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_root *log;
+	struct key;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	if (!root->log_root)
+		return 0;
+
+	log = root->log_root;
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	log = root->log_root;
+	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+			     &log->root_key);
+	BUG_ON(ret);
+	root->log_root = NULL;
+	kfree(root->log_root);
+	return 0;
+}
+
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	u64 bytenr = btrfs_root_bytenr(&log->root_item);
+	int ret;
+
+	if (log->node->start == bytenr)
+		return 0;
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	BUG_ON(ret);
+	return ret;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimizations allows us to avoid relogging the entire inode
+ * or the entire directory.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index)
+{
+	struct btrfs_root *log;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	int ret;
+	int bytes_del = 0;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+	log = root->log_root;
+	path = btrfs_alloc_path();
+	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+				   name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+	btrfs_release_path(log, path);
+	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+					 index, name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+
+	/* update the directory size in the log to reflect the names
+	 * we have removed
+	 */
+	if (bytes_del) {
+		struct btrfs_key key;
+
+		key.objectid = dir->i_ino;
+		key.offset = 0;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		btrfs_release_path(log, path);
+
+		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+		if (ret == 0) {
+			struct btrfs_inode_item *item;
+			u64 i_size;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			i_size = btrfs_inode_size(path->nodes[0], item);
+			if (i_size > bytes_del)
+				i_size -= bytes_del;
+			else
+				i_size = 0;
+			btrfs_set_inode_size(path->nodes[0], item, i_size);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+		} else
+			ret = 0;
+		btrfs_release_path(log, path);
+	}
+
+	btrfs_free_path(path);
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
+	end_log_trans(root);
+
+	return 0;
+}
+
+/* see comments for btrfs_del_dir_entries_in_log */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid)
+{
+	struct btrfs_root *log;
+	u64 index;
+	int ret;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+	log = root->log_root;
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+				  dirid, &index);
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+	end_log_trans(root);
+
+	if (ret == 0 || ret == -ENOENT)
+		return 0;
+	return ret;
+}
+
+/*
+ * creates a range item in the log for 'dirid'.  first_offset and
+ * last_offset tell us which parts of the key space the log should
+ * be considered authoritative for.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       int key_type, u64 dirid,
+				       u64 first_offset, u64 last_offset)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_dir_log_item *item;
+
+	key.objectid = dirid;
+	key.offset = first_offset;
+	if (key_type == BTRFS_DIR_ITEM_KEY)
+		key.type = BTRFS_DIR_LOG_ITEM_KEY;
+	else
+		key.type = BTRFS_DIR_LOG_INDEX_KEY;
+	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
+	BUG_ON(ret);
+
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory.  This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path, int key_type,
+			  u64 min_offset, u64 *last_offset_ret)
+{
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src;
+	int ret;
+	int i;
+	int nritems;
+	u64 first_offset = min_offset;
+	u64 last_offset = (u64)-1;
+
+	log = root->log_root;
+	max_key.objectid = inode->i_ino;
+	max_key.offset = (u64)-1;
+	max_key.type = key_type;
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = key_type;
+	min_key.offset = min_offset;
+
+	path->keep_locks = 1;
+
+	ret = btrfs_search_forward(root, &min_key, &max_key,
+				   path, 0, trans->transid);
+
+	/*
+	 * we didn't find anything from this transaction, see if there
+	 * is anything at all
+	 */
+	if (ret != 0 || min_key.objectid != inode->i_ino ||
+	    min_key.type != key_type) {
+		min_key.objectid = inode->i_ino;
+		min_key.type = key_type;
+		min_key.offset = (u64)-1;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_release_path(root, path);
+			return ret;
+		}
+		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+		/* if ret == 0 there are items for this type,
+		 * create a range to tell us the last key of this type.
+		 * otherwise, there are no items in this directory after
+		 * *min_offset, and we create a range to indicate that.
+		 */
+		if (ret == 0) {
+			struct btrfs_key tmp;
+			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+					      path->slots[0]);
+			if (key_type == tmp.type) {
+				first_offset = max(min_offset, tmp.offset) + 1;
+			}
+		}
+		goto done;
+	}
+
+	/* go backward to find any previous key */
+	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+	if (ret == 0) {
+		struct btrfs_key tmp;
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (key_type == tmp.type) {
+			first_offset = tmp.offset;
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* find the first key from this transaction again */
+	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+	if (ret != 0) {
+		WARN_ON(1);
+		goto done;
+	}
+
+	/*
+	 * we have a block from this transaction, log every item in it
+	 * from our directory
+	 */
+	while(1) {
+		struct btrfs_key tmp;
+		src = path->nodes[0];
+		nritems = btrfs_header_nritems(src);
+		for (i = path->slots[0]; i < nritems; i++) {
+			btrfs_item_key_to_cpu(src, &min_key, i);
+
+			if (min_key.objectid != inode->i_ino ||
+			    min_key.type != key_type)
+				goto done;
+			ret = overwrite_item(trans, log, dst_path, src, i,
+					     &min_key);
+			BUG_ON(ret);
+		}
+		path->slots[0] = nritems;
+
+		/*
+		 * look ahead to the next item and see if it is also
+		 * from this directory and from this transaction
+		 */
+		ret = btrfs_next_leaf(root, path);
+		if (ret == 1) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+
+			BUG_ON(ret);
+			last_offset = tmp.offset;
+			goto done;
+		}
+	}
+done:
+	*last_offset_ret = last_offset;
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, dst_path);
+
+	/* insert the log range keys to indicate where the log is valid */
+	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+				 first_offset, last_offset);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * logging directories is very similar to logging inodes, We find all the items
+ * from the current transaction and write them to the log.
+ *
+ * The recovery code scans the directory in the subvolume, and if it finds a
+ * key in the range logged that is not present in the log tree, then it means
+ * that dir entry was unlinked during the transaction.
+ *
+ * In order for that scan to work, we must include one key smaller than
+ * the smallest logged by this transaction and one key larger than the largest
+ * key logged by this transaction.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path)
+{
+	u64 min_key;
+	u64 max_key;
+	int ret;
+	int key_type = BTRFS_DIR_ITEM_KEY;
+
+again:
+	min_key = 0;
+	max_key = 0;
+	while(1) {
+		ret = log_dir_items(trans, root, inode, path,
+				    dst_path, key_type, min_key,
+				    &max_key);
+		BUG_ON(ret);
+		if (max_key == (u64)-1)
+			break;
+		min_key = max_key + 1;
+	}
+
+	if (key_type == BTRFS_DIR_ITEM_KEY) {
+		key_type = BTRFS_DIR_INDEX_KEY;
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * a helper function to drop items from the log before we relog an
+ * inode.  max_key_type indicates the highest item type to remove.
+ * This cannot be run for file data extents because it does not
+ * free the extents they point to.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  u64 objectid, int max_key_type)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	key.objectid = objectid;
+	key.type = max_key_type;
+	key.offset = (u64)-1;
+
+	while(1) {
+		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+
+		if (ret != 1)
+			break;
+
+		if (path->slots[0] == 0)
+			break;
+
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+
+		if (found_key.objectid != objectid)
+			break;
+
+		ret = btrfs_del_item(trans, log, path);
+		BUG_ON(ret);
+		btrfs_release_path(log, path);
+	}
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only)
+{
+	struct btrfs_path *path;
+	struct btrfs_path *dst_path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct extent_buffer *src;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	u32 size;
+	int ret;
+
+	log = root->log_root;
+
+	path = btrfs_alloc_path();
+	dst_path = btrfs_alloc_path();
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = BTRFS_INODE_ITEM_KEY;
+	min_key.offset = 0;
+
+	max_key.objectid = inode->i_ino;
+	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+		max_key.type = BTRFS_XATTR_ITEM_KEY;
+	else
+		max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	/*
+	 * if this inode has already been logged and we're in inode_only
+	 * mode, we don't want to delete the things that have already
+	 * been written to the log.
+	 *
+	 * But, if the inode has been through an inode_only log,
+	 * the logged_trans field is not set.  This allows us to catch
+	 * any new names for this inode in the backrefs by logging it
+	 * again
+	 */
+	if (inode_only == LOG_INODE_EXISTS &&
+	    BTRFS_I(inode)->logged_trans == trans->transid) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		goto out;
+	}
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	/*
+	 * a brute force approach to making sure we get the most uptodate
+	 * copies of everything.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		ret = drop_objectid_items(trans, log, path,
+					  inode->i_ino, max_key_type);
+	} else {
+		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+	}
+	BUG_ON(ret);
+	path->keep_locks = 1;
+
+	while(1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, trans->transid);
+		if (ret != 0)
+			break;
+
+		if (min_key.objectid != inode->i_ino)
+			break;
+		if (min_key.type > max_key.type)
+			break;
+
+		src = path->nodes[0];
+		size = btrfs_item_size_nr(src, path->slots[0]);
+		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
+					      size);
+		if (ret)
+			BUG();
+
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
+
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, size);
+
+		if (inode_only == LOG_INODE_EXISTS &&
+		    min_key.type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recover code
+			 * can tell the difference between an logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, path->slots[0],
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   log->root_key.objectid,
+						   0,
+						   inode->i_ino,
+						   min_key.offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+
+		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+
+		if (min_key.offset < (u64)-1)
+			min_key.offset++;
+		else if (min_key.type < (u8)-1)
+			min_key.type++;
+		else if (min_key.objectid < (u64)-1)
+			min_key.objectid++;
+		else
+			break;
+	}
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		BUG_ON(ret);
+	}
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_free_path(path);
+	btrfs_free_path(dst_path);
+
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+	return 0;
+}
+
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only)
+{
+	int ret;
+
+	start_log_trans(trans, root);
+	ret = __btrfs_log_inode(trans, root, inode, inode_only);
+	end_log_trans(root);
+	return ret;
+}
+
+/*
+ * helper function around btrfs_log_inode to make sure newly created
+ * parent directories also end up in the log.  A minimal inode and backref
+ * only logging is done of any parent directories that are older than
+ * the last committed transaction
+ */
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct dentry *dentry)
+{
+	int inode_only = LOG_INODE_ALL;
+	struct super_block *sb;
+	int ret;
+
+	start_log_trans(trans, root);
+	sb = dentry->d_inode->i_sb;
+	while(1) {
+		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
+					inode_only);
+		BUG_ON(ret);
+		inode_only = LOG_INODE_EXISTS;
+
+		dentry = dentry->d_parent;
+		if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+			break;
+
+		if (BTRFS_I(dentry->d_inode)->generation <=
+		    root->fs_info->last_trans_committed)
+			break;
+	}
+	end_log_trans(root);
+	return 0;
+}
+
+/*
+ * it is not safe to log dentry if the chunk root has added new
+ * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
+ * If this returns 1, you must commit the transaction to safely get your
+ * data on disk.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry)
+{
+	u64 gen;
+	gen = root->fs_info->last_trans_new_blockgroup;
+	if (gen > root->fs_info->last_trans_committed)
+		return 1;
+	else
+		return btrfs_log_dentry(trans, root, dentry);
+}
+
+/*
+ * should be called during mount to recover any replay any log trees
+ * from the FS
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key tmp_key;
+	struct btrfs_root *log;
+	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	struct walk_control wc = {
+		.process_func = process_one_buffer,
+		.stage = 0,
+	};
+
+	fs_info->log_root_recovering = 1;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+	wc.trans = trans;
+	wc.pin = 1;
+
+	walk_log_tree(trans, log_root_tree, &wc);
+
+again:
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = (u64)-1;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		btrfs_release_path(log_root_tree, path);
+		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			break;
+
+		log = btrfs_read_fs_root_no_radix(log_root_tree,
+						  &found_key);
+		BUG_ON(!log);
+
+
+		tmp_key.objectid = found_key.offset;
+		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_key.offset = (u64)-1;
+
+		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+
+		BUG_ON(!wc.replay_dest);
+
+		btrfs_record_root_in_trans(wc.replay_dest);
+		ret = walk_log_tree(trans, log, &wc);
+		BUG_ON(ret);
+
+		if (wc.stage == LOG_WALK_REPLAY_ALL) {
+			ret = fixup_inode_link_counts(trans, wc.replay_dest,
+						      path);
+			BUG_ON(ret);
+		}
+
+		key.offset = found_key.offset - 1;
+		free_extent_buffer(log->node);
+		kfree(log);
+
+		if (found_key.offset == 0)
+			break;
+	}
+	btrfs_release_path(log_root_tree, path);
+
+	/* step one is to pin it all, step two is to replay just inodes */
+	if (wc.pin) {
+		wc.pin = 0;
+		wc.process_func = replay_one_buffer;
+		wc.stage = LOG_WALK_REPLAY_INODES;
+		goto again;
+	}
+	/* step three is to replay everything */
+	if (wc.stage < LOG_WALK_REPLAY_ALL) {
+		wc.stage++;
+		goto again;
+	}
+
+	btrfs_free_path(path);
+
+	free_extent_buffer(log_root_tree->node);
+	log_root_tree->log_root = NULL;
+	fs_info->log_root_recovering = 0;
+
+	/* step 4: commit the transaction, which also unpins the blocks */
+	btrfs_commit_transaction(trans, fs_info->tree_root);
+
+	kfree(log_root_tree);
+	return 0;
+}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct dentry *dentry);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry);
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid);
+#endif
-- 
cgit v1.2.3


From 49eb7e46d47ea72a9bd2a5f8cedb04f5159cc277 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:53:12 -0400
Subject: Btrfs: Dir fsync optimizations

Drop i_mutex during the commit

Don't bother doing the fsync at all unless the dir is marked as dirtied
and needing fsync in this transaction.  For directories, this means
that someone has unlinked a file from the dir without fsyncing the
file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  3 +++
 fs/btrfs/file.c        | 19 ++++++++++++++++++-
 fs/btrfs/inode.c       |  5 ++++-
 fs/btrfs/tree-log.c    |  8 ++++----
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fcc8cf27e906..0577fda2168a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -56,6 +56,9 @@ struct btrfs_inode {
 	 * transid that last logged this inode
 	 */
 	u64 logged_trans;
+
+	/* trans that last made a change that should be fully fsync'd */
+	u64 log_dirty_trans;
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 84ecf3ab8511..58b329ddb426 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1061,7 +1061,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	root->fs_info->tree_log_batch++;
 	filemap_fdatawait(inode->i_mapping);
+	root->fs_info->tree_log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
@@ -1076,14 +1078,29 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0)
+	if (ret < 0) {
 		goto out;
+	}
+
+	/* we've logged all the items and now have a consistent
+	 * version of the file in the log.  It is possible that
+	 * someone will come in and modify the file, but that's
+	 * fine because the log is consistent on disk, and we
+	 * have references to all of the file's extents
+	 *
+	 * It is possible that someone will come in and log the
+	 * file again, but that will end up using the synchronization
+	 * inside btrfs_sync_log to keep things safe.
+	 */
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
 	} else {
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 24b7e97fccb9..12c1c0530f3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1187,7 +1187,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
-	BUG_ON(ret);
+	BUG_ON(ret != 0 && ret != -ENOENT);
+	if (ret != -ENOENT)
+		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -1790,6 +1792,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
+	bi->log_dirty_trans = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3f4b139b27ed..5d49a701bdcd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1973,10 +1973,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
 	while(1) {
+		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-		batch = log->fs_info->tree_log_batch;
 
 		while(atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
@@ -2189,8 +2189,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 	end_log_trans(root);
 
-	if (ret == 0 || ret == -ENOENT)
-		return 0;
 	return ret;
 }
 
@@ -2620,9 +2618,11 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		else
 			break;
 	}
-	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
+	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
+		BTRFS_I(inode)->log_dirty_trans = 0;
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
-- 
cgit v1.2.3


From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 15:18:18 -0400
Subject: Btrfs: add and improve comments

This improves the comments at the top of many functions.  It didn't
dive into the guts of functions because I was trying to
avoid merging problems with the new allocator and back reference work.

extent-tree.c and volumes.c were both skipped, and there is definitely
more work todo in cleaning and commenting the code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/TODO           |  20 -----
 fs/btrfs/async-thread.c |  10 ++-
 fs/btrfs/async-thread.h |   7 +-
 fs/btrfs/bit-radix.c    | 130 ---------------------------------
 fs/btrfs/bit-radix.h    |  33 ---------
 fs/btrfs/btrfs_inode.h  |  54 +++++++++++++-
 fs/btrfs/crc32c.h       |  18 +++++
 fs/btrfs/ctree.c        | 127 +++++++++++++++++++++++++++++---
 fs/btrfs/ctree.h        |   1 -
 fs/btrfs/dir-item.c     |  41 +++++++++++
 fs/btrfs/disk-io.c      |  33 ++++++++-
 fs/btrfs/extent_io.c    |  34 ++++++++-
 fs/btrfs/extent_map.c   |  10 +++
 fs/btrfs/file.c         |  44 ++++++++++-
 fs/btrfs/inode.c        | 189 +++++++++++++++++++++++++++++++-----------------
 fs/btrfs/locking.c      |  13 ++++
 fs/btrfs/ordered-data.c |  19 ++++-
 fs/btrfs/ref-cache.c    |  26 +++++++
 fs/btrfs/ref-cache.h    |   3 +
 fs/btrfs/root-tree.c    |  21 +++++-
 fs/btrfs/struct-funcs.c |  21 ++++++
 fs/btrfs/super.c        |   3 +
 fs/btrfs/transaction.c  |  67 ++++++++++++++++-
 fs/btrfs/tree-defrag.c  |   4 +
 25 files changed, 653 insertions(+), 277 deletions(-)
 delete mode 100644 fs/btrfs/TODO
 delete mode 100644 fs/btrfs/bit-radix.c
 delete mode 100644 fs/btrfs/bit-radix.h

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5c28557fba9..48b7909ca8d1 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
+	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
deleted file mode 100644
index d9b6d38c603a..000000000000
--- a/fs/btrfs/TODO
+++ /dev/null
@@ -1,20 +0,0 @@
-* cleanup, add more error checking, get rid of BUG_ONs
-* Fix ENOSPC handling
-* Make allocator smarter
-* add a block group to struct inode
-* Do actual block accounting
-* Check compat and incompat flags on the inode
-* Get rid of struct ctree_path, limiting tree levels held at one time
-* Add generation number to key pointer in nodes
-* Add generation number to inode
-* forbid cross subvolume renames and hardlinks
-* Release
-* Do real tree locking
-* Add extent mirroring (backup copies of blocks)
-* Add fancy interface to get access to incremental backups
-* Add fancy striped extents to make big reads faster
-* Use relocation to try and fix write errors
-* Make allocator much smarter
-* xattrs (directory streams for regular files)
-* Scrub & defrag
-
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4e780b279de6..04fb9702d14c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 
 	/*
 	 * if we pick a busy task, move the task to the end of the list.
-	 * hopefully this will keep things somewhat evenly balanced
+	 * hopefully this will keep things somewhat evenly balanced.
+	 * Do the move in batches based on the sequence number.  This groups
+	 * requests submitted at roughly the same time onto the same worker.
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
 	atomic_inc(&worker->num_pending);
 	worker->sequence++;
+
 	if (worker->sequence % workers->idle_thresh == 0)
 		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
+/*
+ * selects a worker thread to take the next job.  This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ */
 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 43e44d115dd1..4ec9a2ee0f9d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -63,14 +63,17 @@ struct btrfs_workers {
 	/* once a worker has this many requests or fewer, it is idle */
 	int idle_thresh;
 
-	/* list with all the work threads */
+	/* list with all the work threads.  The workers on the idle thread
+	 * may be actively servicing jobs, but they haven't yet hit the
+	 * idle thresh limit above.
+	 */
 	struct list_head worker_list;
 	struct list_head idle_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
 
-	/* extra name for this worker */
+	/* extra name for this worker, used for current->name */
 	char *name;
 };
 
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
deleted file mode 100644
index e8bf876db393..000000000000
--- a/fs/btrfs/bit-radix.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "bit-radix.h"
-
-#define BIT_ARRAY_BYTES 256
-#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
-
-extern struct kmem_cache *btrfs_bit_radix_cachep;
-int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-	int ret;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits) {
-		bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
-		if (!bits)
-			return -ENOMEM;
-		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
-		bits[0] = slot;
-		ret = radix_tree_insert(radix, slot, bits);
-		if (ret)
-			return ret;
-	}
-	ret = test_and_set_bit(bit_slot, bits + 1);
-	if (ret < 0)
-		ret = 1;
-	return ret;
-}
-
-int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits)
-		return 0;
-	return test_bit(bit_slot, bits + 1);
-}
-
-int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-	int i;
-	int empty = 1;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits)
-		return 0;
-	clear_bit(bit_slot, bits + 1);
-	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
-		if (bits[i]) {
-			empty = 0;
-			break;
-		}
-	}
-	if (empty) {
-		bits = radix_tree_delete(radix, slot);
-		BUG_ON(!bits);
-		kmem_cache_free(btrfs_bit_radix_cachep, bits);
-	}
-	return 0;
-}
-
-int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 unsigned long start, int nr)
-{
-	unsigned long *bits;
-	unsigned long *gang[4];
-	int found;
-	int ret;
-	int i;
-	int total_found = 0;
-	unsigned long slot;
-
-	slot = start / BIT_RADIX_BITS_PER_ARRAY;
-	ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
-				     ARRAY_SIZE(gang));
-	found = start % BIT_RADIX_BITS_PER_ARRAY;
-	for (i = 0; i < ret && nr > 0; i++) {
-		bits = gang[i];
-		while(nr > 0) {
-			found = find_next_bit(bits + 1,
-					      BIT_RADIX_BITS_PER_ARRAY,
-					      found);
-			if (found < BIT_RADIX_BITS_PER_ARRAY) {
-				*retbits = bits[0] *
-					BIT_RADIX_BITS_PER_ARRAY + found;
-				retbits++;
-				nr--;
-				total_found++;
-				found++;
-			} else
-				break;
-		}
-		found = 0;
-	}
-	return total_found;
-}
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
deleted file mode 100644
index c100f54d5c32..000000000000
--- a/fs/btrfs/bit-radix.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __BIT_RADIX__
-#define __BIT_RADIX__
-#include <linux/radix-tree.h>
-
-int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 unsigned long start, int nr);
-
-static inline void init_bit_radix(struct radix_tree_root *radix)
-{
-	INIT_RADIX_TREE(radix, GFP_NOFS);
-}
-#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0577fda2168a..0b2e623cf421 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,27 +25,58 @@
 
 /* in memory btrfs inode */
 struct btrfs_inode {
+	/* which subvolume this inode belongs to */
 	struct btrfs_root *root;
+
+	/* the block group preferred for allocations.  This pointer is buggy
+	 * and needs to be replaced with a bytenr instead
+	 */
 	struct btrfs_block_group_cache *block_group;
+
+	/* key used to find this inode on disk.  This is used by the code
+	 * to read in roots of subvolumes
+	 */
 	struct btrfs_key location;
+
+	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
+
+	/* the io_tree does range state (DIRTY, LOCKED etc) */
 	struct extent_io_tree io_tree;
+
+	/* special utility tree used to record which mirrors have already been
+	 * tried when checksums fail for a given block
+	 */
 	struct extent_io_tree io_failure_tree;
+
+	/* held while inserting checksums to avoid races */
 	struct mutex csum_mutex;
+
+	/* held while inesrting or deleting extents from files */
 	struct mutex extent_mutex;
+
+	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
-	struct inode vfs_inode;
+
+	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
+	/* standard acl pointers */
 	struct posix_acl *i_acl;
 	struct posix_acl *i_default_acl;
 
 	/* for keeping track of orphaned inodes */
 	struct list_head i_orphan;
 
+	/* list of all the delalloc inodes in the FS.  There are times we need
+	 * to write all the delalloc pages to disk, and this list is used
+	 * to walk them all.
+	 */
 	struct list_head delalloc_inodes;
 
-	/* full 64 bit generation number */
+	/* full 64 bit generation number, struct vfs_inode doesn't have a big
+	 * enough field for this.
+	 */
 	u64 generation;
 
 	/*
@@ -57,10 +88,25 @@ struct btrfs_inode {
 	 */
 	u64 logged_trans;
 
-	/* trans that last made a change that should be fully fsync'd */
+	/*
+	 * trans that last made a change that should be fully fsync'd.  This
+	 * gets reset to zero each time the inode is logged
+	 */
 	u64 log_dirty_trans;
+
+	/* total number of bytes pending delalloc, used by stat to calc the
+	 * real block usage of the file
+	 */
 	u64 delalloc_bytes;
+
+	/*
+	 * the size of the file stored in the metadata on disk.  data=ordered
+	 * means the in-memory i_size might be larger than the size on disk
+	 * because not all the blocks are written yet.
+	 */
 	u64 disk_i_size;
+
+	/* flags field from the on disk inode */
 	u32 flags;
 
 	/*
@@ -68,6 +114,8 @@ struct btrfs_inode {
 	 * number for new files that are created
 	 */
 	u64 index_cnt;
+
+	struct inode vfs_inode;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index 4f0fefed132a..1eaf11d334fd 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __BTRFS_CRC32C__
 #define __BTRFS_CRC32C__
 #include <asm/byteorder.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 50e81f43e6d4..ff3261ff2e19 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2007,2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void)
 	return path;
 }
 
+/* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
 
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that no locks or extent buffers held.
+ */
 void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
@@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	}
 }
 
+/*
+ * safely gets a reference on the root node of a tree.  A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree.  See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear.  It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
@@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
@@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* cowonly root (everything not a reference counted cow subvolume), just get
+ * put onto a simple dirty list.  transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	if (root->track_dirty && list_empty(&root->dirty_list)) {
@@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
 	}
 }
 
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid.  The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * does the dirty work in cow of a single block.  The parent block
+ * (if supplied) is updated to point to the new cow copy.  The new
+ * buffer is marked dirty and returned locked.  If you modify the block
+ * it needs to be marked dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
+ * the allocator should try to find free next to the block it returns.  This is
+ * just a hint and may be ignored by the allocator.
+ *
+ * prealloc_dest -- if you have already reserved a destination for the cow,
+ * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
+ * is used to finish the allocation.
+ */
 int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
@@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
 int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 {
 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
@@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 }
 
 
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int start_slot, int cache_only, u64 *last_ret,
@@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
+/*
+ * extra debugging checks to make sure all the items in a key are
+ * well formed and in the proper order
+ */
 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
@@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ */
 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
@@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
+/* given a node and slot number, this reads the blocks it points to.  The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
@@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		       btrfs_node_ptr_generation(parent, slot));
 }
 
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion.  We balance from the top down, so we have to make sure
+ * that a deletion won't leave an node completely empty later on.
+ */
 static noinline int balance_level(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 struct btrfs_path *path, int level)
@@ -1024,7 +1101,10 @@ enospc:
 	return ret;
 }
 
-/* returns zero if the push worked, non-zero otherwise */
+/* Node balancing for insertion.  Here we only split or push nodes around
+ * when they are completely full.  This is also done top down, so we
+ * have to be pessimistic.
+ */
 static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, int level)
@@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 }
 
 /*
- * readahead one full node of leaves
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
  */
 static noinline void reada_for_search(struct btrfs_root *root,
 				      struct btrfs_path *path,
@@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	}
 }
 
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers in
+ * the tree.  The exceptions are when our path goes through slot 0, because operations
+ * on the tree might require changing key pointers higher up in the tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to
+ * keep the lock if the path points to the last slot in the block.  This is
+ * part of walking through the tree, and selecting the next slot in the higher
+ * block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
+ * so if lowest_unlock is 1, level 0 won't be unlocked
+ */
 static noinline void unlock_up(struct btrfs_path *path, int level,
 			       int lowest_unlock)
 {
@@ -2705,6 +2799,12 @@ again:
 	return ret;
 }
 
+/*
+ * make the item pointed to by the path smaller.  new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_path *path,
@@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * make the item pointed to by the path bigger, data_size is the new size.
+ */
 int btrfs_extend_item(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, struct btrfs_path *path,
 		      u32 data_size)
@@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 }
 
 /*
- * Given a key and some data, insert an item into the tree.
+ * Given a key and some data, insert items into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 /*
  * delete the pointer from a given node.
  *
- * If the delete empties a node, the node is removed from the tree,
- * continuing all the way the root if required.  The root is converted into
- * a leaf if all the nodes are emptied.
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
  */
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot)
@@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * search the tree again to find a leaf with lesser keys
  * returns 0 if it found something or 1 if there are no lesser leaves.
  * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
@@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 /*
  * A helper function to walk down the tree starting at min_key, and looking
  * for nodes or leaves that are either in cache or have a minimum
- * transaction id.  This is used by the btree defrag code, but could
- * also be used to search for blocks that have changed since a given
- * transaction id.
+ * transaction id.  This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * This honors path->lowest_level to prevent descent past a given level
  * of the tree.
  *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through.  Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
  * returns zero if something useful was found, < 0 on error and 1 if there
  * was nothing in the tree that matched the search criteria.
  */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0079b60b18f3..ded1643c0273 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,7 +27,6 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <asm/kmap_types.h>
-#include "bit-radix.h"
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e4f30090d640..5040b71f1900 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,14 @@
 #include "hash.h"
 #include "transaction.h"
 
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision.  data_size indicates how big the item inserted should be.  On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
 static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 						   *trans,
 						   struct btrfs_root *root,
@@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	return (struct btrfs_dir_item *)ptr;
 }
 
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, const char *name,
 			    u16 name_len, const void *data, u16 data_len,
@@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type, u64 index)
@@ -184,6 +203,11 @@ out:
 	return 0;
 }
 
+/*
+ * lookup a directory item based on name.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     struct btrfs_path *path, u64 dir,
@@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+/*
+ * lookup a directory item based on index.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
 struct btrfs_dir_item *
 btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
@@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
@@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	return NULL;
 }
 
+/*
+ * given a pointer into a directory item, delete it.  This
+ * handles items that have more than one entry in them.
+ */
 int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45b4f7285275..5ee10d3136f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete.  This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
 struct end_io_wq {
 	struct bio *bio;
 	bio_end_io_t *end_io;
@@ -66,6 +71,11 @@ struct end_io_wq {
 	struct btrfs_work work;
 };
 
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads.  They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
 struct async_submit_bio {
 	struct inode *inode;
 	struct bio *bio;
@@ -76,6 +86,10 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 len,
 				    int create)
@@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result)
 	*(__le32 *)result = ~cpu_to_le32(crc);
 }
 
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
@@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer.  This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
@@ -228,9 +252,12 @@ out:
 	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
 		      GFP_NOFS);
 	return ret;
-
 }
 
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 start, u64 parent_transid)
@@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
 	return -EIO;
 }
 
+/*
+ * checksum a dirty tree block before IO.  This has extra checks to make
+ * sure we only fill in the checksum field in the first page of a multi-page block
+ */
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8bd1b402f3fd..563b2d12f4f2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(wait_on_extent_writeback);
 
+/*
+ * either insert or lock state struct between start and end use mask to tell
+ * us if waiting is desired.
+ */
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 {
 	int err;
@@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(set_range_writeback);
 
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits)
 {
@@ -1017,6 +1028,10 @@ out:
 }
 EXPORT_SYMBOL(find_first_extent_bit);
 
+/* find the first state struct with 'bits' set after 'start', and
+ * return it.  tree->lock must be held.  NULL will returned if
+ * nothing was found after 'start'
+ */
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 						 u64 start, int bits)
 {
@@ -1046,8 +1061,14 @@ out:
 }
 EXPORT_SYMBOL(find_first_extent_bit_state);
 
-u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-			     u64 *start, u64 *end, u64 max_bytes)
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
+					     u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1130,6 +1151,11 @@ out:
 	return found;
 }
 
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set.  This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached.  The total number found is returned.
+ */
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end, u64 max_bytes,
 		     unsigned long bits)
@@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(unlock_range);
 
+/*
+ * set the private field for a given byte offset in the tree.  If there isn't
+ * an extent_state there already, this does nothing.
+ */
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 {
 	struct rb_node *node;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 78ced11d18c7..74b2a29880d3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
+/*
+ * search through the tree for an extent_map with a given offset.  If
+ * it can't be found, try to find some neighboring extents
+ */
 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
@@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
+/*
+ * look for an offset in the tree, and if it can't be found, return
+ * the first offset we can find smaller than 'offset'.
+ */
 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 {
 	struct rb_node *prev;
@@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 	return ret;
 }
 
+/* check to see if two extent_map structs are adjacent and safe to merge */
 static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 {
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@@ -250,6 +259,7 @@ out:
 }
 EXPORT_SYMBOL(add_extent_mapping);
 
+/* simple helper to do math around the end of an extent, handling wrap */
 static u64 range_end(u64 start, u64 len)
 {
 	if (start + len < start)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1b7e51a9db0f..3088a1184483 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,9 @@
 #include "compat.h"
 
 
+/* simple helper to fault in pages and copy.  This should go away
+ * and be replaced with calls into generic code.
+ */
 static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
@@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 	return page_fault ? -EFAULT : 0;
 }
 
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
 static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
 		if (!pages[i])
 			break;
+		/* page checked is some magic around finding pages that
+		 * have been modified without going through btrfs_set_page_dirty
+		 * clear it here
+		 */
 		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
 		mark_page_accessed(pages[i]);
@@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
+/* this does all the hard work for inserting an inline extent into
+ * the btree.  Any existing inline extent is extended as required to make room,
+ * otherwise things are inserted as required into the btree
+ */
 static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 offset, size_t size,
@@ -228,6 +242,14 @@ fail:
 	return err;
 }
 
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
 static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
@@ -362,6 +384,10 @@ out_unlock:
 	return err;
 }
 
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end].  Existing extents are split as required.
+ */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned)
 {
@@ -536,6 +562,9 @@ out:
  * If an extent intersects the range but is not entirely inside the range
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
+ *
+ * inline_limit is used to tell this code which offsets in the file to keep
+ * if they contain inline extents.
  */
 int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
@@ -796,7 +825,9 @@ out:
 }
 
 /*
- * this gets pages into the page cache and locks them down
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
  */
 static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
@@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
 	return 0;
 }
 
+/*
+ * fsync call for both files and directories.  This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates are
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit.  This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 404704d26822..f3abecc2d14c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static void btrfs_truncate(struct inode *inode);
 
+/*
+ * a very lame attempt at stopping writes when the FS is 85% full.  There
+ * are countless ways this is incorrect, but it is better than nothing.
+ */
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
@@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	return ret;
 }
 
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ */
 static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -185,6 +195,13 @@ out:
 	return ret;
 }
 
+/*
+ * when nowcow writeback call back.  This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
 static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 {
 	u64 extent_start;
@@ -291,6 +308,9 @@ out:
 	return err;
 }
 
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
 static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	return ret;
 }
 
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
@@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
 int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
@@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio)
 {
@@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	return 0;
 }
 
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the cums attached on the ordered extent record
+ * are inserted into the btree
+ */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
@@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation on write,
+ * or reading the csums from the tree before a read
+ */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
@@ -408,6 +452,10 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+/*
+ * given a list of ordered sums record them in the inode.  This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
@@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 				   GFP_NOFS);
 }
 
+/* see btrfs_writepage_start_hook for details on why this is required */
 struct btrfs_writepage_fixup {
 	struct page *page;
 	struct btrfs_work work;
 };
 
-/* see btrfs_writepage_start_hook for details on why this is required */
 void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
@@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	return -EAGAIN;
 }
 
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 	return 0;
 }
 
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
 int btrfs_clean_io_failures(struct inode *inode, u64 start)
 {
 	u64 private;
@@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
 	return 0;
 }
 
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish.  If not, we go through
+ * the io_failure_record routines to find good copies
+ */
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	btrfs_free_path(path);
 }
 
+/*
+ * read an inode from the btree into the in-memory inode
+ */
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -1083,6 +1155,9 @@ make_bad:
 	make_bad_inode(inode);
 }
 
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
 static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *leaf,
 			    struct btrfs_inode_item *item,
@@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
 int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
@@ -1151,6 +1229,11 @@ failed:
 }
 
 
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code.  It remove a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct inode *dir, struct inode *inode,
@@ -1309,7 +1392,7 @@ fail:
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
- * any higher than i_size.
+ * any higher than new_size
  *
  * csum items that cross the new i_size are truncated to the new size
  * as well.
@@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 }
 
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
 static int btrfs_set_inode_index_count(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2175,6 +2263,10 @@ out:
 	return ret;
 }
 
+/*
+ * helper to find a free sequence number in a given directory.  This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
 static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
 				 u64 *index)
 {
@@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 }
 
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a give name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
 int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index)
@@ -2611,6 +2709,10 @@ out_unlock:
 	return err;
 }
 
+/* helper for btfs_get_extent.  Given an existing extent in the tree,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the new extent into the tree.
+ */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
 				struct extent_map *em,
@@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	return add_extent_mapping(em_tree, em);
 }
 
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the
+ * in-ram representation.  This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -2869,76 +2979,11 @@ out:
 	return em;
 }
 
-#if 0 /* waiting for O_DIRECT reads */
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	struct extent_map *em;
-	u64 start = (u64)iblock << inode->i_blkbits;
-	struct btrfs_multi_bio *multi = NULL;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 len;
-	u64 logical;
-	u64 map_length;
-	int ret = 0;
-
-	em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
-
-	if (!em || IS_ERR(em))
-		goto out;
-
-	if (em->start > start || em->start + em->len <= start) {
-	    goto out;
-	}
-
-	if (em->block_start == EXTENT_MAP_INLINE) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	len = em->start + em->len - start;
-	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
-	if (em->block_start == EXTENT_MAP_HOLE ||
-	    em->block_start == EXTENT_MAP_DELALLOC) {
-		bh_result->b_size = len;
-		goto out;
-	}
-
-	logical = start - em->start;
-	logical = em->block_start + logical;
-
-	map_length = len;
-	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      logical, &map_length, &multi, 0);
-	BUG_ON(ret);
-	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
-	bh_result->b_size = min(map_length, len);
-
-	bh_result->b_bdev = multi->stripes[0].dev->bdev;
-	set_buffer_mapped(bh_result);
-	kfree(multi);
-out:
-	free_extent_map(em);
-	return ret;
-}
-#endif
-
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	return -EINVAL;
-#if 0
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-
-	if (rw == WRITE)
-		return -EINVAL;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, btrfs_get_block, NULL);
-#endif
 }
 
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 	}
 }
 
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group)
@@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	return btrfs_update_inode(trans, new_root, inode);
 }
 
+/* helper function for file defrag and space balancing.  This
+ * forces readahead on a given range of bytes in an inode
+ */
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index)
@@ -3424,6 +3475,10 @@ out_unlock:
 	return ret;
 }
 
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
 	struct list_head *head = &root->fs_info->delalloc_inodes;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0cc314c10d66..e30aa6e2958f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,6 +25,15 @@
 #include "extent_io.h"
 #include "locking.h"
 
+/*
+ * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
+ * and the spin is not tuned very extensively.  The spinning does make a big
+ * difference in almost every workload, but spinning for the right amount of
+ * time needs some help.
+ *
+ * In general, we want to spin as long as the lock holder is doing btree searches,
+ * and we should give up if they are in more expensive code.
+ */
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
@@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 	return mutex_is_locked(&eb->mutex);
 }
 
+/*
+ * btrfs_search_slot uses this to decide if it should drop its locks
+ * before doing something expensive like allocating free blocks for cow.
+ */
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 951eacff2420..dcc1730dd837 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -26,7 +26,6 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 
-
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
 	if (entry->file_offset + entry->len < entry->file_offset)
@@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
 	return entry->file_offset + entry->len;
 }
 
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 				   struct rb_node *node)
 {
@@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 	return NULL;
 }
 
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 				     struct rb_node **prev_ret)
 {
@@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	return NULL;
 }
 
+/*
+ * helper to check if a given offset is inside a given entry
+ */
 static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 {
 	if (file_offset < entry->file_offset ||
@@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 	return 1;
 }
 
+/*
+ * look find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
 static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 					  u64 file_offset)
 {
@@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	return 0;
 }
 
+/*
+ * wait for all the ordered extents in a root.  This is done when balancing
+ * space between drives.
+ */
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 {
 	struct list_head splice;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 30fcb7aea5b5..a50ebb67055d 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,6 +21,16 @@
 #include "ref-cache.h"
 #include "transaction.h"
 
+/*
+ * leaf refs are used to cache the information about which extents
+ * a given leaf has references on.  This allows us to process that leaf
+ * in btrfs_drop_snapshot without needing to read it back from disk.
+ */
+
+/*
+ * kmalloc a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
 struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
 					    int nr_extents)
 {
@@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
 	return ref;
 }
 
+/*
+ * free a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
 void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	if (!ref)
@@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 	return 0;
 }
 
+/*
+ * find the leaf ref for a given extent.  This returns the ref struct with
+ * a usage reference incremented
+ */
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr)
 {
@@ -160,6 +178,10 @@ again:
 	return NULL;
 }
 
+/*
+ * add a fully filled in leaf ref struct
+ * remove all the refs older than a given root generation
+ */
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 		       int shared)
 {
@@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 	return ret;
 }
 
+/*
+ * remove a single leaf ref from the tree.  This drops the ref held by the tree
+ * only
+ */
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	struct btrfs_leaf_ref_tree *tree;
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 617564787f52..16f3183d7c59 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -19,8 +19,11 @@
 #define __REFCACHE__
 
 struct btrfs_extent_info {
+	/* bytenr and num_bytes find the extent in the extent allocation tree */
 	u64 bytenr;
 	u64 num_bytes;
+
+	/* objectid and offset find the back reference for the file */
 	u64 objectid;
 	u64 offset;
 };
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0091c01abb06..eb7f7655e9d5 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,8 +22,10 @@
 #include "print-tree.h"
 
 /*
- * returns 0 on finding something, 1 if no more roots are there
- * and < 0 on error
+ *  search forward for a root, starting with objectid 'search_start'
+ *  if a root key is found, the objectid we find is filled into 'found_objectid'
+ *  and 0 is returned.  < 0 is returned on error, 1 if there is nothing
+ *  left in the tree.
  */
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid)
@@ -66,6 +68,11 @@ out:
 	return ret;
 }
 
+/*
+ * lookup the root with the highest offset for a given objectid.  The key we do
+ * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
+ * on error.
+ */
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 			struct btrfs_root_item *item, struct btrfs_key *key)
 {
@@ -104,6 +111,9 @@ out:
 	return ret;
 }
 
+/*
+ * copy the data in 'item' into the btree
+ */
 int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_root_item
 		      *item)
@@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+/*
+ * at mount time we want to find all the old transaction snapshots that were in
+ * the process of being deleted if we crashed.  This is any root item with an offset
+ * lower than the latest root.  They need to be queued for deletion to finish
+ * what was happening when we crashed.
+ */
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest)
 {
@@ -227,6 +243,7 @@ err:
 	return ret;
 }
 
+/* drop the root item for 'key' from 'root' */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key)
 {
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index ad03a32d1116..cdedbe144d45 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,6 +17,27 @@
  */
 
 #include <linux/highmem.h>
+
+/* this is some deeply nasty code.  ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions
+ *
+ * This file declares the macros and then #includes ctree.h, which results
+ * in cpp creating the function here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers.  Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type.  This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ */
+
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
 u##bits btrfs_##name(struct extent_buffer *eb,				\
 				   type *s)				\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8399d6d05d63..2e6039825b7b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
 static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 444abe0796ae..11266d68a6c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	}
 }
 
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
 static noinline int join_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
+/*
+ * this does all the record keeping required to make sure that a
+ * reference counted root is properly recorded in a given transaction.
+ * This is required to make sure the old root from before we joined the transaction
+ * is deleted when the transaction commits
+ */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
 	struct btrfs_dirty_root *dirty;
@@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 	return 0;
 }
 
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
 static void wait_current_trans(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 	return start_transaction(r, num_blocks, 2);
 }
 
-
+/* wait for a transaction commit to be fully complete */
 static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
@@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+/*
+ * rate limit against the drop_snapshot code.  This helps to slow down new operations
+ * if the drop_snapshot code isn't able to keep up.
+ */
 static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 	return __btrfs_end_transaction(trans, root, 1);
 }
 
-
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages)
 {
@@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 					   &trans->transaction->dirty_pages);
 }
 
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
@@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * update all the cowonly tree roots on disk
+ */
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
@@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * dead roots are old snapshots that need to be deleted.  This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 {
 	struct btrfs_dirty_root *dirty;
@@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 	return 0;
 }
 
+/*
+ * at transaction commit time we need to schedule the old roots for
+ * deletion via btrfs_drop_snapshot.  This runs through all the
+ * reference counted roots that were modified in the current
+ * transaction and puts them into the drop list
+ */
 static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				    struct radix_tree_root *radix,
 				    struct list_head *list)
@@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return err;
 }
 
+/*
+ * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
+ * otherwise every leaf in the btree is read and defragged.
+ */
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }
 
+/*
+ * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
+ * all of them
+ */
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 				     struct list_head *list)
 {
@@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit.  This does the actual creation
+ */
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -734,6 +791,9 @@ fail:
 	return ret;
 }
 
+/*
+ * create all the snapshots we've scheduled for creation
+ */
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
 {
@@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ */
 int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
 	struct list_head dirty_roots;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b3bb5bbad76e..6f57d0889b1e 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,6 +23,10 @@
 #include "transaction.h"
 #include "locking.h"
 
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read things
+ * from disk, otherwise read all the leaves and try to get key order to
+ * better reflect disk order
+ */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
-- 
cgit v1.2.3


From c3027eb5523d6983f12628f3fe13d8a7576db701 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:40:21 -0500
Subject: Btrfs: Add inode sequence number for NFS and reserved space in a few
 structs

This adds a sequence number to the btrfs inode that is increased on
every update.  NFS will be able to use that to detect when an inode has
changed, without relying on inaccurate time fields.

While we're here, this also:

Puts reserved space into the super block and inode

Adds a log root transid to the super so we can pick the newest super
based on the fsync log as well as the main transaction ID.  For now
the log root transid is always zero, but that'll get fixed.

Adds a starting offset to the dev_item.  This will let us do better
alignment calculations if we know the start of a partition on the disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  6 +++---
 fs/btrfs/ctree.h       | 26 ++++++++++++++++++++++++++
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       |  4 +++-
 fs/btrfs/volumes.c     |  1 +
 5 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0b2e623cf421..1b9ec1ab1f68 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -49,9 +49,6 @@ struct btrfs_inode {
 	 */
 	struct extent_io_tree io_failure_tree;
 
-	/* held while inserting checksums to avoid races */
-	struct mutex csum_mutex;
-
 	/* held while inesrting or deleting extents from files */
 	struct mutex extent_mutex;
 
@@ -79,6 +76,9 @@ struct btrfs_inode {
 	 */
 	u64 generation;
 
+	/* sequence number for NFS changes */
+	u64 sequence;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 242b961ae6de..f72b43819349 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -196,6 +196,12 @@ struct btrfs_dev_item {
 	/* expected generation for this device */
 	__le64 generation;
 
+	/*
+	 * starting byte of this partition on the device,
+	 * to allowr for stripe alignment in the future
+	 */
+	__le64 start_offset;
+
 	/* grouping information for allocation decisions */
 	__le32 dev_group;
 
@@ -311,6 +317,9 @@ struct btrfs_super_block {
 	__le64 root;
 	__le64 chunk_root;
 	__le64 log_root;
+
+	/* this will help find the new super based on the log root */
+	__le64 log_root_transid;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -329,7 +338,11 @@ struct btrfs_super_block {
 	u8 chunk_root_level;
 	u8 log_root_level;
 	struct btrfs_dev_item dev_item;
+
 	char label[BTRFS_LABEL_SIZE];
+
+	/* future expansion */
+	__le64 reserved[32];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -463,6 +476,14 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le64 flags;
 
+	/* modification sequence number for NFS */
+	__le64 sequence;
+
+	/*
+	 * a little future expansion, for more than this we can
+	 * just grow the inode item and version it
+	 */
+	__le64 reserved[4];
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -1001,6 +1022,8 @@ BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
 BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
 BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+		   start_offset, 64);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
@@ -1135,6 +1158,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
@@ -1519,6 +1543,8 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
 			 chunk_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
 			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+			 log_root_transid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
 			 log_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1c9243560eab..b5a6a2b6f668 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1055,6 +1055,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
 	mutex_lock(&inode->i_mutex);
+	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03d847b8c4e..932d8c0b2c05 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1963,6 +1963,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2043,6 +2044,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2945,6 +2947,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->i_default_acl = NULL;
 
 	bi->generation = 0;
+	bi->sequence = 0;
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
@@ -2959,7 +2962,6 @@ static noinline void init_btrfs_i(struct inode *inode)
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 825364fae690..4d210a731d40 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -890,6 +890,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
+	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-- 
cgit v1.2.3


From d2fb3437e4d8d12c73c587615ad187d5288547ec Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 11 Dec 2008 16:30:39 -0500
Subject: Btrfs: fix leaking block group on balance

The block group structs are referenced in many different
places, and it's not safe to free while balancing.  So, those block
group structs were simply leaked instead.

This patch replaces the block group pointer in the inode with the starting byte
offset of the block group and adds reference counting to the block group
struct.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   8 ++-
 fs/btrfs/ctree.h       |  17 ++++---
 fs/btrfs/extent-tree.c | 132 ++++++++++++++++++++++---------------------------
 fs/btrfs/inode.c       |  43 +++++-----------
 fs/btrfs/ioctl.c       |   2 +-
 fs/btrfs/transaction.c |   2 +-
 fs/btrfs/transaction.h |   2 +-
 7 files changed, 88 insertions(+), 118 deletions(-)

(limited to 'fs/btrfs/btrfs_inode.h')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1b9ec1ab1f68..a8c9693b75ac 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -28,11 +28,6 @@ struct btrfs_inode {
 	/* which subvolume this inode belongs to */
 	struct btrfs_root *root;
 
-	/* the block group preferred for allocations.  This pointer is buggy
-	 * and needs to be replaced with a bytenr instead
-	 */
-	struct btrfs_block_group_cache *block_group;
-
 	/* key used to find this inode on disk.  This is used by the code
 	 * to read in roots of subvolumes
 	 */
@@ -115,6 +110,9 @@ struct btrfs_inode {
 	 */
 	u64 index_cnt;
 
+	/* the start of block group preferred for allocations. */
+	u64 block_group;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5b0c79d22c09..8733081d97a3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -653,6 +653,9 @@ struct btrfs_block_group_cache {
 
 	/* for block groups in the same raid type */
 	struct list_head list;
+
+	/* usage count */
+	atomic_t count;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -1706,10 +1709,8 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr);
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner);
+u64 btrfs_find_block_group(struct btrfs_root *root,
+			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize, u64 parent,
@@ -1770,6 +1771,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -2019,10 +2021,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
-int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
-		struct btrfs_trans_handle *trans, u64 new_dirid,
-		struct btrfs_block_group_cache *block_group);
-
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root, struct dentry *dentry,
+			     u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 673ff59c288a..1cc89246ee2f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -53,10 +53,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root, int all);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root, int all);
-static struct btrfs_block_group_cache *
-__btrfs_find_block_group(struct btrfs_root *root,
-			 struct btrfs_block_group_cache *hint,
-			 u64 search_start, int data, int owner);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -142,6 +138,8 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 			break;
 		}
 	}
+	if (ret)
+		atomic_inc(&ret->count);
 	spin_unlock(&info->block_group_cache_lock);
 
 	return ret;
@@ -318,6 +316,12 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	return cache;
 }
 
+static inline void put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count))
+		kfree(cache);
+}
+
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
@@ -341,54 +345,16 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-static struct btrfs_block_group_cache *
-__btrfs_find_block_group(struct btrfs_root *root,
-			 struct btrfs_block_group_cache *hint,
-			 u64 search_start, int data, int owner)
+u64 btrfs_find_block_group(struct btrfs_root *root,
+			   u64 search_start, u64 search_hint, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct btrfs_block_group_cache *found_group = NULL;
-	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
-	u64 last = 0;
-	u64 free_check;
+	u64 last = max(search_hint, search_start);
+	u64 group_start = 0;
 	int full_search = 0;
-	int factor = 10;
+	int factor = 9;
 	int wrapped = 0;
-
-	if (data & BTRFS_BLOCK_GROUP_METADATA)
-		factor = 9;
-
-	if (search_start) {
-		struct btrfs_block_group_cache *shint;
-		shint = btrfs_lookup_first_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data)) {
-			spin_lock(&shint->lock);
-			used = btrfs_block_group_used(&shint->item);
-			if (used + shint->pinned + shint->reserved <
-			    div_factor(shint->key.offset, factor)) {
-				spin_unlock(&shint->lock);
-				return shint;
-			}
-			spin_unlock(&shint->lock);
-		}
-	}
-	if (hint && block_group_bits(hint, data)) {
-		spin_lock(&hint->lock);
-		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned + hint->reserved <
-		    div_factor(hint->key.offset, factor)) {
-			spin_unlock(&hint->lock);
-			return hint;
-		}
-		spin_unlock(&hint->lock);
-		last = hint->key.objectid + hint->key.offset;
-	} else {
-		if (hint)
-			last = max(hint->key.objectid, search_start);
-		else
-			last = search_start;
-	}
 again:
 	while (1) {
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -399,16 +365,18 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (block_group_bits(cache, data)) {
-			free_check = div_factor(cache->key.offset, factor);
+		if ((full_search || !cache->ro) &&
+		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 			if (used + cache->pinned + cache->reserved <
-			    free_check) {
-				found_group = cache;
+			    div_factor(cache->key.offset, factor)) {
+				group_start = cache->key.objectid;
 				spin_unlock(&cache->lock);
+				put_block_group(cache);
 				goto found;
 			}
 		}
 		spin_unlock(&cache->lock);
+		put_block_group(cache);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -423,18 +391,7 @@ again:
 		goto again;
 	}
 found:
-	return found_group;
-}
-
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner)
-{
-
-	struct btrfs_block_group_cache *ret;
-	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
-	return ret;
+	return group_start;
 }
 
 /* simple helper to search for an existing extent at a given offset */
@@ -1809,6 +1766,19 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_block_group_cache *block_group;
+	int readonly = 0;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+	if (!block_group || block_group->ro)
+		readonly = 1;
+	if (block_group)
+		put_block_group(block_group);
+	return readonly;
+}
+
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
@@ -1995,10 +1965,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				int ret;
 				ret = btrfs_add_free_space(cache, bytenr,
 							   num_bytes);
-				if (ret)
-					return -1;
+				WARN_ON(ret);
 			}
 		}
+		put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2008,12 +1978,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 {
 	struct btrfs_block_group_cache *cache;
+	u64 bytenr;
 
 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
 	if (!cache)
 		return 0;
 
-	return cache->key.objectid;
+	bytenr = cache->key.objectid;
+	put_block_group(cache);
+
+	return bytenr;
 }
 
 int btrfs_update_pinned_extents(struct btrfs_root *root,
@@ -2055,6 +2029,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
+		put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2085,6 +2060,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
+		put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2724,6 +2700,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
 			BUG_ON(!cache);
 			btrfs_add_free_space(cache, bytenr, num_bytes);
+			put_block_group(cache);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
@@ -2928,6 +2905,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		}
 new_group:
 		mutex_unlock(&block_group->alloc_mutex);
+		put_block_group(block_group);
+		block_group = NULL;
 new_group_no_lock:
 		/* don't try to compare new allocations against the
 		 * last allocation any more
@@ -2997,6 +2976,8 @@ loop_check:
 
 		block_group = list_entry(cur, struct btrfs_block_group_cache,
 					 list);
+		atomic_inc(&block_group->count);
+
 		search_start = block_group->key.objectid;
 		cur = cur->next;
 	}
@@ -3004,7 +2985,7 @@ loop_check:
 	/* we found what we needed */
 	if (ins->objectid) {
 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
-			trans->block_group = block_group;
+			trans->block_group = block_group->key.objectid;
 
 		if (last_ptr)
 			*last_ptr = ins->objectid + ins->offset;
@@ -3015,6 +2996,8 @@ loop_check:
 		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
+	if (block_group)
+		put_block_group(block_group);
 
 	up_read(&space_info->groups_sem);
 	return ret;
@@ -3124,6 +3107,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
+	put_block_group(cache);
 	update_reserved_extents(root, start, len, 0);
 	return 0;
 }
@@ -3288,6 +3272,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
 	BUG_ON(ret);
+	put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
 	return ret;
@@ -5703,6 +5688,7 @@ next:
 	WARN_ON(block_group->reserved > 0);
 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
 	spin_unlock(&block_group->lock);
+	put_block_group(block_group);
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -5763,6 +5749,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
+
+		WARN_ON(atomic_read(&block_group->count) != 1);
 		kfree(block_group);
 
 		spin_lock(&info->block_group_cache_lock);
@@ -5807,6 +5795,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		}
 
+		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
 		mutex_init(&cache->alloc_mutex);
 		mutex_init(&cache->cache_mutex);
@@ -5861,11 +5850,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	mutex_init(&cache->alloc_mutex);
 	mutex_init(&cache->cache_mutex);
 	INIT_LIST_HEAD(&cache->list);
-	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5926,10 +5916,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&block_group->space_info->lock);
 	block_group->space_info->full = 0;
 
-	/*
-	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
-	kfree(shrink_block_group);
-	*/
+	put_block_group(block_group);
+	put_block_group(block_group);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 932d8c0b2c05..0a28b7706314 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -989,7 +989,6 @@ next_slot:
 
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			struct btrfs_block_group_cache *block_group;
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
@@ -1007,9 +1006,7 @@ next_slot:
 				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
 				goto out_check;
-			block_group = btrfs_lookup_block_group(root->fs_info,
-							       disk_bytenr);
-			if (!block_group || block_group->ro)
+			if (btrfs_extent_readonly(root, disk_bytenr))
 				goto out_check;
 			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
 			nocow = 1;
@@ -1969,16 +1966,11 @@ void btrfs_read_locked_inode(struct inode *inode)
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
+	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
-	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
-						       alloc_group_block);
-	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-	if (!BTRFS_I(inode)->block_group) {
-		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
-						 NULL, 0,
-						 BTRFS_BLOCK_GROUP_METADATA, 0);
-	}
+	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+						alloc_group_block, 0);
 	btrfs_free_path(path);
 	inode_item = NULL;
 
@@ -2048,8 +2040,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-	btrfs_set_inode_block_group(leaf, item,
-				    BTRFS_I(inode)->block_group->key.objectid);
+	btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
 }
 
 /*
@@ -3358,14 +3349,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
 				     const char *name, int name_len,
-				     u64 ref_objectid,
-				     u64 objectid,
-				     struct btrfs_block_group_cache *group,
-				     int mode, u64 *index)
+				     u64 ref_objectid, u64 objectid,
+				     u64 alloc_hint, int mode, u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_block_group_cache *new_inode_group;
 	struct btrfs_key *location;
 	struct btrfs_path *path;
 	struct btrfs_inode_ref *ref;
@@ -3401,13 +3389,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	new_inode_group = btrfs_find_block_group(root, group, 0,
-				       BTRFS_BLOCK_GROUP_METADATA, owner);
-	if (!new_inode_group) {
-		printk("find_block group failed\n");
-		new_inode_group = group;
-	}
-	BTRFS_I(inode)->block_group = new_inode_group;
+	BTRFS_I(inode)->block_group =
+			btrfs_find_block_group(root, 0, alloc_hint, owner);
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4366,16 +4349,16 @@ out:
 /*
  * create a new subvolume directory/inode (helper for the ioctl).
  */
-int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
-		struct btrfs_trans_handle *trans, u64 new_dirid,
-		struct btrfs_block_group_cache *block_group)
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root, struct dentry *dentry,
+			     u64 new_dirid, u64 alloc_hint)
 {
 	struct inode *inode;
 	int error;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, block_group, S_IFDIR | 0700, &index);
+				new_dirid, alloc_hint, S_IFDIR | 0700, &index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 69c4a07f5869..5d67858ce993 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -173,7 +173,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid,
+	ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
 				       BTRFS_I(dir)->block_group);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 47cd5fcad2c8..4604178a43a9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -182,7 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
-	h->block_group = NULL;
+	h->block_group = 0;
 	h->alloc_exclude_nr = 0;
 	h->alloc_exclude_start = 0;
 	root->fs_info->running_transaction->use_count++;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 202c8be6c05d..ffe7f639732b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -41,7 +41,7 @@ struct btrfs_trans_handle {
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
-	struct btrfs_block_group_cache *block_group;
+	u64 block_group;
 	u64 alloc_exclude_start;
 	u64 alloc_exclude_nr;
 };
-- 
cgit v1.2.3