summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
committerChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
commitd20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/inode.c
parentc99e905c945c462085c6d64646dc5af0c0a16815 (diff)
downloadlwn-d20f7043fa65659136c1a7c3c456eeeb5c6f431f.tar.gz
lwn-d20f7043fa65659136c1a7c3c456eeeb5c6f431f.zip
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have touch the subvolume and inodes as we cache extents. * There is potentitally one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by phyiscal extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c45
1 files changed, 24 insertions, 21 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09efc9473a3d..c03d847b8c4e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
- ret = btrfs_csum_one_bio(root, inode, bio);
+ ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
BUG_ON(ret);
return 0;
}
@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
btrfs_test_flag(inode, NODATASUM);
if (!(rw & (1 << BIO_RW))) {
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (bio_flags & EXTENT_BIO_COMPRESSED) {
return btrfs_submit_compressed_read(inode, bio,
mirror_num, bio_flags);
- else if (!skip_sum)
- btrfs_lookup_bio_sums(root, inode, bio);
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums(root, inode, bio, NULL);
goto mapit;
} else if (!skip_sum) {
/* we're doing a write, do the async checksumming */
@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
btrfs_set_trans_block_group(trans, inode);
list_for_each(cur, list) {
sum = list_entry(cur, struct btrfs_ordered_sum, list);
- btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
- inode, sum);
+ btrfs_csum_file_blocks(trans,
+ BTRFS_I(inode)->root->fs_info->csum_root, sum);
}
return 0;
}
@@ -1545,6 +1544,7 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
+ unsigned long bio_flags;
int last_mirror;
};
@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
int ret;
int rw;
u64 logical;
- unsigned long bio_flags = 0;
ret = get_state_private(failure_tree, start, &private);
if (ret) {
@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->start = start;
failrec->len = end - start + 1;
failrec->last_mirror = 0;
+ failrec->bio_flags = 0;
spin_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
}
logical = start - em->start;
logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- bio_flags = EXTENT_BIO_COMPRESSED;
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ logical = em->block_start;
+ failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+ }
failrec->logical = logical;
free_extent_map(em);
set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
bio->bi_sector = failrec->logical >> 9;
bio->bi_bdev = failed_bio->bi_bdev;
bio->bi_size = 0;
+
bio_add_page(bio, page, failrec->len, start - page_offset(page));
if (failed_bio->bi_rw & (1 << BIO_RW))
rw = WRITE;
@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
- bio_flags);
+ failrec->bio_flags);
return 0;
}
@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
u32 csum = ~(u32)0;
unsigned long flags;
+ if (PageChecked(page)) {
+ ClearPageChecked(page);
+ goto good;
+ }
if (btrfs_test_opt(root, NODATASUM) ||
btrfs_test_flag(inode, NODATASUM))
return 0;
+
if (state && state->start == start) {
private = state->private;
ret = 0;
@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
kunmap_atomic(kaddr, KM_IRQ0);
local_irq_restore(flags);
-
+good:
/* if the io failure tree for this inode is non-empty,
* check to see if we've recovered from a failed IO
*/
@@ -2243,6 +2251,7 @@ fail:
return err;
}
+#if 0
/*
* when truncating bytes in a file, it is possible to avoid reading
* the leaves that contain only checksum items. This can be the
@@ -2410,6 +2419,8 @@ out:
return ret;
}
+#endif
+
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
btrfs_init_path(path);
- ret = drop_csum_leaves(trans, root, path, inode, new_size);
- BUG_ON(ret);
-
search_again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
@@ -2509,16 +2517,11 @@ search_again:
}
item_end--;
}
- if (found_type == BTRFS_CSUM_ITEM_KEY) {
- ret = btrfs_csum_truncate(trans, root, path,
- new_size);
- BUG_ON(ret);
- }
if (item_end < new_size) {
if (found_type == BTRFS_DIR_ITEM_KEY) {
found_type = BTRFS_INODE_ITEM_KEY;
} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
- found_type = BTRFS_CSUM_ITEM_KEY;
+ found_type = BTRFS_EXTENT_DATA_KEY;
} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
found_type = BTRFS_XATTR_ITEM_KEY;
} else if (found_type == BTRFS_XATTR_ITEM_KEY) {