summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/bio.c211
-rw-r--r--fs/btrfs/bio.h22
-rw-r--r--fs/btrfs/block-group.c40
-rw-r--r--fs/btrfs/block-group.h13
-rw-r--r--fs/btrfs/block-rsv.c21
-rw-r--r--fs/btrfs/block-rsv.h2
-rw-r--r--fs/btrfs/btrfs_inode.h35
-rw-r--r--fs/btrfs/compression.c299
-rw-r--r--fs/btrfs/compression.h20
-rw-r--r--fs/btrfs/ctree.c91
-rw-r--r--fs/btrfs/ctree.h17
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-ref.c49
-rw-r--r--fs/btrfs/delayed-ref.h22
-rw-r--r--fs/btrfs/disk-io.c147
-rw-r--r--fs/btrfs/extent-tree.c37
-rw-r--r--fs/btrfs/extent_io.c550
-rw-r--r--fs/btrfs/file-item.c93
-rw-r--r--fs/btrfs/file-item.h3
-rw-r--r--fs/btrfs/fs.h53
-rw-r--r--fs/btrfs/inode-item.c15
-rw-r--r--fs/btrfs/inode.c375
-rw-r--r--fs/btrfs/ioctl.c5
-rw-r--r--fs/btrfs/locking.c25
-rw-r--r--fs/btrfs/locking.h5
-rw-r--r--fs/btrfs/lru_cache.h5
-rw-r--r--fs/btrfs/lzo.c17
-rw-r--r--fs/btrfs/messages.c2
-rw-r--r--fs/btrfs/messages.h2
-rw-r--r--fs/btrfs/ordered-data.c120
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/raid56.c162
-rw-r--r--fs/btrfs/raid56.h12
-rw-r--r--fs/btrfs/relocation.c6
-rw-r--r--fs/btrfs/scrub.c4040
-rw-r--r--fs/btrfs/send.c2
-rw-r--r--fs/btrfs/space-info.c32
-rw-r--r--fs/btrfs/space-info.h1
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/sysfs.c5
-rw-r--r--fs/btrfs/tests/extent-map-tests.c1
-rw-r--r--fs/btrfs/transaction.c28
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c171
-rw-r--r--fs/btrfs/volumes.c593
-rw-r--r--fs/btrfs/volumes.h85
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/btrfs/zoned.c4
-rw-r--r--fs/btrfs/zstd.c1
50 files changed, 2754 insertions, 4717 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 37b6bab90c83..66fa9ab2c046 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,6 +2,7 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
+ select BLK_CGROUP_PUNT_BIO
select CRYPTO
select CRYPTO_CRC32C
select LIBCRC32C
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 726592868e9c..5379c4714905 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -31,11 +31,11 @@ struct btrfs_failed_bio {
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
btrfs_bio_end_io_t end_io, void *private)
{
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
- bbio->inode = inode;
+ bbio->fs_info = fs_info;
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
*/
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_inode *inode,
- btrfs_bio_end_io_t end_io, void *private)
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_fs_info *fs_info,
+ btrfs_bio_end_io_t end_io, void *private)
{
+ struct btrfs_bio *bbio;
struct bio *bio;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
- return bio;
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, fs_info, end_io, private);
+ return bbio;
}
-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
- struct bio *orig, u64 map_length,
- bool use_append)
+static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio)
{
- struct btrfs_bio *orig_bbio = btrfs_bio(orig);
+ struct btrfs_ordered_extent *ordered;
+ int ret;
+
+ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+ if (WARN_ON_ONCE(!ordered))
+ return BLK_STS_IOERR;
+ ret = btrfs_extract_ordered_extent(bbio, ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ return errno_to_blk_status(ret);
+}
+
+static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
+ struct btrfs_bio *orig_bbio,
+ u64 map_length, bool use_append)
+{
+ struct btrfs_bio *bbio;
struct bio *bio;
if (use_append) {
unsigned int nr_segs;
- bio = bio_split_rw(orig, &fs_info->limits, &nr_segs,
+ bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
&btrfs_clone_bioset, map_length);
} else {
- bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS,
- &btrfs_clone_bioset);
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
+ GFP_NOFS, &btrfs_clone_bioset);
}
- btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio);
-
- btrfs_bio(bio)->file_offset = orig_bbio->file_offset;
- if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED))
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
+ bbio->inode = orig_bbio->inode;
+ bbio->file_offset = orig_bbio->file_offset;
+ if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED))
orig_bbio->file_offset += map_length;
atomic_inc(&orig_bbio->pending_ios);
- return bio;
+ return bbio;
}
static void btrfs_orig_write_end_io(struct bio *bio);
@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
- btrfs_submit_bio(&repair_bbio->bio, mirror);
+ btrfs_submit_bio(repair_bbio, mirror);
return;
}
@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
&btrfs_repair_bioset);
repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
- bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
+ __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);
repair_bbio = btrfs_bio(repair_bio);
- btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio);
+ btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
+ repair_bbio->inode = failed_bbio->inode;
repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
- btrfs_submit_bio(repair_bio, mirror);
+ btrfs_submit_bio(repair_bbio, mirror);
return fbio;
}
@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
struct btrfs_failed_bio *fbio = NULL;
u32 offset = 0;
+ /* Read-repair requires the inode field to be set by the submitter. */
+ ASSERT(inode);
+
/*
* Hand off repair bios to the repair code as there is no upper level
* submitter for them.
@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work)
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
/* Metadata reads are checked and repaired by the submitter. */
- if (bbio->bio.bi_opf & REQ_META)
- bbio->end_io(bbio);
- else
+ if (bbio->inode && !(bbio->bio.bi_opf & REQ_META))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
+ else
+ bbio->end_io(bbio);
}
static void btrfs_simple_end_io(struct bio *bio)
{
struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_device *dev = bio->bi_private;
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
btrfs_bio_counter_dec(fs_info);
@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio)
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
- if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META))
+ if (bio_op(bio) == REQ_OP_READ && bbio->inode &&
+ !(bbio->bio.bi_opf & REQ_META))
btrfs_check_read_bio(bbio, NULL);
else
btrfs_orig_bbio_end_io(bbio);
@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
dev->devid, bio->bi_iter.bi_size);
btrfsic_check_bio(bio);
- submit_bio(bio);
+
+ if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
+ blkcg_punt_bio_submit(bio);
+ else
+ submit_bio(bio);
}
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work)
/*
* All of the bios that pass through here are from async helpers.
- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
- * This changes nothing when cgroups aren't in use.
+ * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
+ * context. This changes nothing when cgroups aren't in use.
*/
- bio->bi_opf |= REQ_CGROUP_PUNT;
+ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
* in order.
*/
if (bbio->bio.bi_opf & REQ_META) {
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
if (btrfs_is_zoned(fs_info))
return false;
@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
struct btrfs_io_stripe *smap, int mirror_num)
{
- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
+static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
- struct btrfs_bio *bbio = btrfs_bio(bio);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct btrfs_bio *orig_bbio = bbio;
+ struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
map_length = min(map_length, fs_info->max_zone_append_size);
if (map_length < length) {
- bio = btrfs_split_bio(fs_info, bio, map_length, use_append);
- bbio = btrfs_bio(bio);
+ bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ bio = &bbio->bio;
}
/*
* Save the iter for the end_io handler and preload the checksums for
* data reads.
*/
- if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
+ if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
if (ret)
@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
if (use_append) {
bio->bi_opf &= ~REQ_OP_WRITE;
bio->bi_opf |= REQ_OP_ZONE_APPEND;
- ret = btrfs_extract_ordered_extent(btrfs_bio(bio));
+ ret = btrfs_bio_extract_ordered_extent(bbio);
if (ret)
goto fail_put_bio;
}
@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num)
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
*/
- if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
+ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
@@ -686,9 +712,12 @@ fail:
return true;
}
-void btrfs_submit_bio(struct bio *bio, int mirror_num)
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
- while (!btrfs_submit_chunk(bio, mirror_num))
+ /* If bbio->inode is not populated, its file_offset must be 0. */
+ ASSERT(bbio->inode || bbio->file_offset == 0);
+
+ while (!btrfs_submit_chunk(bbio, mirror_num))
;
}
@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
- struct btrfs_device *dev;
+ struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
struct bio bio;
- u64 map_length = 0;
- u64 sector;
- struct btrfs_io_context *bioc = NULL;
int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
- map_length = length;
-
/*
* Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- if (btrfs_is_parity_mirror(fs_info, logical, length)) {
- /*
- * Note that we don't use BTRFS_MAP_WRITE because it's supposed
- * to update all raid stripes, but here we just want to correct
- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
- * stripe's dev and sector.
- */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bioc, 0);
- if (ret)
- goto out_counter_dec;
- ASSERT(bioc->mirror_num == 1);
- } else {
- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bioc, mirror_num);
- if (ret)
- goto out_counter_dec;
- /*
- * This happens when dev-replace is also running, and the
- * mirror_num indicates the dev-replace target.
- *
- * In this case, we don't need to do anything, as the read
- * error just means the replace progress hasn't reached our
- * read range, and later replace routine would handle it well.
- */
- if (mirror_num != bioc->mirror_num)
- goto out_counter_dec;
- }
-
- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- dev = bioc->stripes[bioc->mirror_num - 1].dev;
- btrfs_put_bioc(bioc);
+ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+ if (ret < 0)
+ goto out_counter_dec;
- if (!dev || !dev->bdev ||
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
+ if (!smap.dev->bdev ||
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
ret = -EIO;
goto out_counter_dec;
}
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = sector;
+ bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
- ino, start, btrfs_dev_name(dev), sector);
+ ino, start, btrfs_dev_name(smap.dev),
+ smap.physical >> SECTOR_SHIFT);
ret = 0;
out_bio_uninit:
@@ -791,6 +787,45 @@ out_counter_dec:
return ret;
}
+/*
+ * Submit a btrfs_bio based repair write.
+ *
+ * If @dev_replace is true, the write would be submitted to dev-replace target.
+ */
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
+{
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
+ u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 length = bbio->bio.bi_iter.bi_size;
+ struct btrfs_io_stripe smap = { 0 };
+ int ret;
+
+ ASSERT(fs_info);
+ ASSERT(mirror_num > 0);
+ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
+ ASSERT(!bbio->inode);
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
+ if (ret < 0)
+ goto fail;
+
+ if (dev_replace) {
+ if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) {
+ bbio->bio.bi_opf &= ~REQ_OP_WRITE;
+ bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ ASSERT(smap.dev == fs_info->dev_replace.srcdev);
+ smap.dev = fs_info->dev_replace.tgtdev;
+ }
+ __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+ return;
+
+fail:
+ btrfs_bio_counter_dec(fs_info);
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+}
+
int __init btrfs_bioset_init(void)
{
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index 873ff85817f0..a8eca3a65673 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
* passed to btrfs_submit_bio for mapping to the physical devices.
*/
struct btrfs_bio {
- /* Inode and offset into it that this I/O operates on. */
+ /*
+ * Inode and offset into it that this I/O operates on.
+ * Only set for data I/O.
+ */
struct btrfs_inode *inode;
u64 file_offset;
@@ -58,6 +61,9 @@ struct btrfs_bio {
atomic_t pending_ios;
struct work_struct end_io_work;
+ /* File system that this I/O operates on. */
+ struct btrfs_fs_info *fs_info;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
int __init btrfs_bioset_init(void);
void __cold btrfs_bioset_exit(void);
-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode,
+void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
btrfs_bio_end_io_t end_io, void *private);
-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
- struct btrfs_inode *inode,
- btrfs_bio_end_io_t end_io, void *private);
+struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ struct btrfs_fs_info *fs_info,
+ btrfs_bio_end_io_t end_io, void *private);
static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
@@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
/* Bio only refers to one ordered extent. */
#define REQ_BTRFS_ONE_ORDERED REQ_DRV
-void btrfs_submit_bio(struct bio *bio, int mirror_num);
+/* Submit using blkcg_punt_bio_submit. */
+#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
+
+void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5fc670c27f86..957ad1c31c4f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
- /*
- * If not empty, someone is still holding mutex of
- * full_stripe_lock, which can only be released by caller.
- * And it will definitely cause use-after-free when caller
- * tries to release full stripe lock.
- *
- * No better way to resolve, but only to warn.
- */
- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache->physical_map);
kfree(cache);
@@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
map = em->map_lookup;
data_stripe_length = em->orig_block_len;
- io_stripe_size = map->stripe_len;
+ io_stripe_size = BTRFS_STRIPE_LEN;
chunk_start = em->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- io_stripe_size = map->stripe_len * nr_data_stripes(map);
+ io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
@@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
- u64 stripe_nr;
- u64 offset;
+ u32 stripe_nr;
+ u32 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
- stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
+ stripe_nr = (physical - map->stripes[i].physical) >>
+ BTRFS_STRIPE_LEN_SHIFT;
+ offset = (physical - map->stripes[i].physical) &
+ BTRFS_STRIPE_LEN_MASK;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- }
+ BTRFS_BLOCK_GROUP_RAID10))
+ stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
+ map->sub_stripes);
/*
* The remaining case would be for RAID56, multiply by
* nr_data_stripes(). Alternatively, just use rmap_len below
* instead of map->stripe_len
*/
-
bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- cache->full_stripe_locks_root.root = RB_ROOT;
- mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
}
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 bytes_used, u64 type,
+ u64 type,
u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->length = size;
set_free_space_tree_thresholds(cache);
- cache->used = bytes_used;
cache->flags = type;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- cache->space_info->bytes_used += new_bytes_used >> 1;
+ cache->space_info->bytes_used += size >> 1;
fragment_free_space(cache);
}
#endif
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6e4a0b429ac3..cc0e4b37db2d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -91,14 +91,6 @@ struct btrfs_caching_control {
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -229,9 +221,6 @@ struct btrfs_block_group {
*/
int swap_extents;
- /* Record locked full stripes for RAID5/6 block group */
- struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
-
/*
* Allocation offset for the block group to implement sequential
* allocation. This is used only on a zoned filesystem.
@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 bytes_used, u64 type,
+ u64 type,
u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 5367a14d44d2..3ab707e26fa2 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -232,9 +232,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
u64 num_bytes = 0;
int ret = -ENOSPC;
- if (!block_rsv)
- return 0;
-
spin_lock(&block_rsv->lock);
num_bytes = mult_perc(block_rsv->size, min_percent);
if (block_rsv->reserved >= num_bytes)
@@ -245,17 +242,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
}
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
- u64 num_bytes = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
- num_bytes = min_reserved;
if (block_rsv->reserved >= num_bytes)
ret = 0;
else
@@ -355,17 +350,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
/*
* But we also want to reserve enough space so we can do the fallback
- * global reserve for an unlink, which is an additional 5 items (see the
- * comment in __unlink_start_trans for what we're modifying.)
+ * global reserve for an unlink, which is an additional
+ * BTRFS_UNLINK_METADATA_UNITS items.
*
* But we also need space for the delayed ref updates from the unlink,
- * so its 10, 5 for the actual operation, and 5 for the delayed ref
- * updates.
+ * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
+ * each unlink metadata item.
*/
- min_items += 10;
+ min_items += BTRFS_UNLINK_METADATA_UNITS;
num_bytes = max_t(u64, num_bytes,
- btrfs_calc_insert_metadata_size(fs_info, min_items));
+ btrfs_calc_insert_metadata_size(fs_info, min_items) +
+ btrfs_calc_delayed_ref_bytes(fs_info,
+ BTRFS_UNLINK_METADATA_UNITS));
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 4cc41c9aaa82..6dc781709aca 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9dc21622806e..ec2ae4406c16 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -142,11 +142,22 @@ struct btrfs_inode {
/* a local copy of root's last_log_commit */
int last_log_commit;
- /*
- * Total number of bytes pending delalloc, used by stat to calculate the
- * real block usage of the file. This is used only for files.
- */
- u64 delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc, used by stat to
+ * calculate the real block usage of the file. This is used
+ * only for files.
+ */
+ u64 delalloc_bytes;
+ /*
+ * The lowest possible index of the next dir index key which
+ * points to an inode that needs to be logged.
+ * This is used only for directories.
+ * Use the helpers btrfs_get_first_dir_index_to_log() and
+ * btrfs_set_first_dir_index_to_log() to access this field.
+ */
+ u64 first_dir_index_to_log;
+ };
union {
/*
@@ -247,6 +258,17 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode)
+{
+ return READ_ONCE(inode->first_dir_index_to_log);
+}
+
+static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
+ u64 index)
+{
+ WRITE_ONCE(inode->first_dir_index_to_log, index);
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f42f31f22d13..2d0493f0a184 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -37,6 +37,8 @@
#include "file-item.h"
#include "super.h"
+struct bio_set btrfs_compressed_bioset;
+
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
@@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
return NULL;
}
+static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
+{
+ return container_of(bbio, struct compressed_bio, bbio);
+}
+
+static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
+ u64 start, blk_opf_t op,
+ btrfs_bio_end_io_t end_io)
+{
+ struct btrfs_bio *bbio;
+
+ bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
+ GFP_NOFS, &btrfs_compressed_bioset));
+ btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
+ bbio->inode = inode;
+ bbio->file_offset = start;
+ return to_compressed_bio(bbio);
+}
+
bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
int i;
@@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws,
}
}
+static void btrfs_free_compressed_pages(struct compressed_bio *cb)
+{
+ for (unsigned int i = 0; i < cb->nr_pages; i++)
+ put_page(cb->compressed_pages[i]);
+ kfree(cb->compressed_pages);
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bbio->private;
- unsigned int index;
- struct page *page;
+ struct compressed_bio *cb = to_compressed_bio(bbio);
+ blk_status_t status = bbio->bio.bi_status;
- if (bbio->bio.bi_status)
- cb->status = bbio->bio.bi_status;
- else
- cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
-
- /* Release the compressed pages */
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* Do io completion on the original bio */
- btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
+ if (!status)
+ status = errno_to_blk_status(btrfs_decompress_bio(cb));
- /* Finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ btrfs_free_compressed_pages(cb);
+ btrfs_bio_end_io(cb->orig_bbio, status);
bio_put(&bbio->bio);
}
@@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline void end_compressed_writeback(struct inode *inode,
- const struct compressed_bio *cb)
+static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
+ struct inode *inode = &cb->bbio.inode->vfs_inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int errno = blk_status_to_errno(cb->status);
+ const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
@@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode,
/* the inode may be gone now */
}
-static void finish_compressed_bio_write(struct compressed_bio *cb)
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
{
- struct inode *inode = cb->inode;
- unsigned int index;
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
/*
* Ok, we're the last bio for this extent, step one is to call back
* into the FS and do all the end_io operations.
*/
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
+ btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL,
cb->start, cb->start + cb->len - 1,
- cb->status == BLK_STS_OK);
+ cb->bbio.bio.bi_status == BLK_STS_OK);
if (cb->writeback)
- end_compressed_writeback(inode, cb);
+ end_compressed_writeback(cb);
/* Note, our inode could be gone now */
- /*
- * Release the compressed pages, these came from alloc_page and
- * are not attached to the inode at all
- */
- for (index = 0; index < cb->nr_pages; index++) {
- struct page *page = cb->compressed_pages[index];
-
- page->mapping = NULL;
- put_page(page);
- }
-
- /* Finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
-}
-
-static void btrfs_finish_compressed_write_work(struct work_struct *work)
-{
- struct compressed_bio *cb =
- container_of(work, struct compressed_bio, write_end_work);
-
- finish_compressed_bio_write(cb);
+ btrfs_free_compressed_pages(cb);
+ bio_put(&cb->bbio.bio);
}
/*
@@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
*/
static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bbio->private;
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct compressed_bio *cb = to_compressed_bio(bbio);
+ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
- cb->status = bbio->bio.bi_status;
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+}
- bio_put(&bbio->bio);
+static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
+{
+ struct bio *bio = &cb->bbio.bio;
+ u32 offset = 0;
+
+ while (offset < cb->compressed_len) {
+ u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);
+
+ /* Maximum compressed extent is smaller than bio size limit. */
+ __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
+ len, 0);
+ offset += len;
+ }
}
/*
@@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio)
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int len, u64 disk_start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
blk_opf_t write_flags,
- struct cgroup_subsys_state *blkcg_css,
bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio *bio = NULL;
struct compressed_bio *cb;
- u64 cur_disk_bytenr = disk_start;
- blk_status_t ret = BLK_STS_OK;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
- if (!cb)
- return BLK_STS_RESOURCE;
- cb->status = BLK_STS_OK;
- cb->inode = &inode->vfs_inode;
+
+ write_flags |= REQ_BTRFS_ONE_ORDERED;
+
+ cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags,
+ end_compressed_bio_write);
cb->start = start;
cb->len = len;
cb->compressed_pages = compressed_pages;
@@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->writeback = writeback;
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
+ cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
+ btrfs_add_compressed_bio_pages(cb);
- if (blkcg_css) {
- kthread_associate_blkcg(blkcg_css);
- write_flags |= REQ_CGROUP_PUNT;
- }
-
- write_flags |= REQ_BTRFS_ONE_ORDERED;
- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags,
- BTRFS_I(cb->inode), end_compressed_bio_write, cb);
- bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT;
- btrfs_bio(bio)->file_offset = start;
-
- while (cur_disk_bytenr < disk_start + compressed_len) {
- u64 offset = cur_disk_bytenr - disk_start;
- unsigned int index = offset >> PAGE_SHIFT;
- unsigned int real_size;
- unsigned int added;
- struct page *page = compressed_pages[index];
-
- /*
- * We have various limits on the real read size:
- * - page boundary
- * - compressed length boundary
- */
- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
- real_size = min_t(u64, real_size, compressed_len - offset);
- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
- added = bio_add_page(bio, page, real_size, offset_in_page(offset));
- /*
- * Maximum compressed extent is smaller than bio size limit,
- * thus bio_add_page() should always success.
- */
- ASSERT(added == real_size);
- cur_disk_bytenr += added;
- }
-
- /* Finished the range. */
- ASSERT(bio->bi_iter.bi_size);
- btrfs_submit_bio(bio, 0);
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
- return ret;
-}
-
-static u64 bio_end_offset(struct bio *bio)
-{
- struct bio_vec *last = bio_last_bvec_all(bio);
-
- return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
+ btrfs_submit_bio(&cb->bbio, 0);
}
/*
@@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- u64 cur = bio_end_offset(cb->orig_bio);
+ struct bio *orig_bio = &cb->orig_bbio->bio;
+ u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
int ret;
struct page *page;
@@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
if (!em || cur < em->start ||
(cur + fs_info->sectorsize > extent_map_end(em)) ||
- (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
+ (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
@@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
add_size = min(em->start + em->len, page_end + 1) - cur;
- ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
@@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num)
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map_tree *em_tree;
+ struct btrfs_inode *inode = bbio->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- struct bio *comp_bio;
- const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- u64 cur_disk_byte = disk_bytenr;
- u64 file_offset;
+ u64 file_offset = bbio->file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
@@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int memstall = 0;
blk_status_t ret;
int ret2;
- int i;
-
- em_tree = &BTRFS_I(inode)->extent_tree;
-
- file_offset = bio_first_bvec_all(bio)->bv_offset +
- page_offset(bio_first_page_all(bio));
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
@@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
- if (!cb) {
- ret = BLK_STS_RESOURCE;
- goto out;
- }
- cb->status = BLK_STS_OK;
- cb->inode = inode;
+ cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
+ end_compressed_bio_read);
cb->start = em->orig_start;
em_len = em->len;
em_start = em->start;
- cb->len = bio->bi_iter.bi_size;
+ cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
cb->compress_type = em->compress_type;
- cb->orig_bio = bio;
+ cb->orig_bbio = bbio;
free_extent_map(em);
- em = NULL;
cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
if (!cb->compressed_pages) {
ret = BLK_STS_RESOURCE;
- goto fail;
+ goto out_free_bio;
}
ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
if (ret2) {
ret = BLK_STS_RESOURCE;
- goto fail;
+ goto out_free_compressed_pages;
}
- add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags);
+ add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
+ &pflags);
/* include any pages we added in add_ra-bio_pages */
- cb->len = bio->bi_iter.bi_size;
-
- comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode),
- end_compressed_bio_read, cb);
- comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT);
-
- while (cur_disk_byte < disk_bytenr + compressed_len) {
- u64 offset = cur_disk_byte - disk_bytenr;
- unsigned int index = offset >> PAGE_SHIFT;
- unsigned int real_size;
- unsigned int added;
- struct page *page = cb->compressed_pages[index];
-
- /*
- * We have various limit on the real read size:
- * - page boundary
- * - compressed length boundary
- */
- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset));
- real_size = min_t(u64, real_size, compressed_len - offset);
- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
-
- added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
- /*
- * Maximum compressed extent is smaller than bio size limit,
- * thus bio_add_page() should always success.
- */
- ASSERT(added == real_size);
- cur_disk_byte += added;
- }
+ cb->len = bbio->bio.bi_iter.bi_size;
+ cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
+ btrfs_add_compressed_bio_pages(cb);
if (memstall)
psi_memstall_leave(&pflags);
- /*
- * Stash the initial offset of this chunk, as there is no direct
- * correlation between compressed pages and the original file offset.
- * The field is only used for printing error messages anyway.
- */
- btrfs_bio(comp_bio)->file_offset = file_offset;
-
- ASSERT(comp_bio->bi_iter.bi_size);
- btrfs_submit_bio(comp_bio, mirror_num);
+ btrfs_submit_bio(&cb->bbio, mirror_num);
return;
-fail:
- if (cb->compressed_pages) {
- for (i = 0; i < cb->nr_pages; i++) {
- if (cb->compressed_pages[i])
- __free_page(cb->compressed_pages[i]);
- }
- }
-
+out_free_compressed_pages:
kfree(cb->compressed_pages);
- kfree(cb);
+out_free_bio:
+ bio_put(&cb->bbio.bio);
out:
- free_extent_map(em);
- btrfs_bio_end_io(btrfs_bio(bio), ret);
- return;
+ btrfs_bio_end_io(bbio, ret);
}
/*
@@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
ret = compression_decompress_bio(workspace, cb);
put_workspace(type, workspace);
+ if (!ret)
+ zero_fill_bio(&cb->orig_bbio->bio);
return ret;
}
@@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
int __init btrfs_init_compress(void)
{
+ if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
+ offsetof(struct compressed_bio, bbio.bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
@@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void)
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_cleanup_workspace_manager();
+ bioset_exit(&btrfs_compressed_bioset);
}
/*
@@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void)
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed)
{
- struct bio *orig_bio = cb->orig_bio;
+ struct bio *orig_bio = &cb->orig_bbio->bio;
/* Offset inside the full decompressed extent */
u32 cur_offset;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index a5e3377db9ad..19ab2abeddc0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,8 +6,8 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
-#include <linux/blk_types.h>
#include <linux/sizes.h>
+#include "bio.h"
struct btrfs_inode;
@@ -23,6 +23,7 @@ struct btrfs_inode;
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE)
static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
/* Maximum size of data before compression */
@@ -37,9 +38,6 @@ struct compressed_bio {
/* the pages with the compressed data on them */
struct page **compressed_pages;
- /* inode that owns this data */
- struct inode *inode;
-
/* starting offset in the inode for our pages */
u64 start;
@@ -55,14 +53,14 @@ struct compressed_bio {
/* Whether this is a write for writeback. */
bool writeback;
- /* IO errors */
- blk_status_t status;
-
union {
/* For reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
+ struct btrfs_bio *orig_bbio;
struct work_struct write_end_work;
};
+
+ /* Must be last. */
+ struct btrfs_bio bbio;
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
+void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int len, u64 disk_start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
blk_opf_t write_flags,
- struct cgroup_subsys_state *blkcg_css,
bool writeback);
-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num);
+void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a5b6bb54545f..3c983c70028a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
* Search for a key in the given extent_buffer.
*
* The lower boundary for the search is specified by the slot number @first_slot.
- * Use a value of 0 to search over the whole extent buffer.
+ * Use a value of 0 to search over the whole extent buffer. Works for both
+ * leaves and nodes.
*
* The slot in the extent buffer is returned via @slot. If the key exists in the
* extent buffer, then @slot will point to the slot where the key is, otherwise
@@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
- const struct btrfs_key *key, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+ const struct btrfs_key *key, int *slot)
{
unsigned long p;
int item_size;
@@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
- BUG_ON(level == 0);
+ ASSERT(level);
check.level = level - 1;
check.transid = btrfs_node_ptr_generation(parent, slot);
@@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
return 0;
- left = btrfs_read_node_slot(parent, pslot - 1);
- if (IS_ERR(left))
- left = NULL;
+ if (pslot) {
+ left = btrfs_read_node_slot(parent, pslot - 1);
+ if (IS_ERR(left)) {
+ ret = PTR_ERR(left);
+ left = NULL;
+ goto enospc;
+ }
- if (left) {
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
@@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
}
- right = btrfs_read_node_slot(parent, pslot + 1);
- if (IS_ERR(right))
- right = NULL;
+ if (pslot + 1 < btrfs_header_nritems(parent)) {
+ right = btrfs_read_node_slot(parent, pslot + 1);
+ if (IS_ERR(right)) {
+ ret = PTR_ERR(right);
+ right = NULL;
+ goto enospc;
+ }
- if (right) {
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
@@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (!parent)
return 1;
- left = btrfs_read_node_slot(parent, pslot - 1);
- if (IS_ERR(left))
- left = NULL;
-
/* first, try to make some room in the middle buffer */
- if (left) {
+ if (pslot) {
u32 left_nr;
+ left = btrfs_read_node_slot(parent, pslot - 1);
+ if (IS_ERR(left))
+ return PTR_ERR(left);
+
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
@@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(left);
free_extent_buffer(left);
}
- right = btrfs_read_node_slot(parent, pslot + 1);
- if (IS_ERR(right))
- right = NULL;
/*
* then try to empty the right most buffer into the middle
*/
- if (right) {
+ if (pslot + 1 < btrfs_header_nritems(parent)) {
u32 right_nr;
+ right = btrfs_read_node_slot(parent, pslot + 1);
+ if (IS_ERR(right))
+ return PTR_ERR(right);
+
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
@@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb,
return 0;
}
- return btrfs_generic_bin_search(eb, search_low_slot, key, slot);
+ return btrfs_bin_search(eb, search_low_slot, key, slot);
}
static int search_leaf(struct btrfs_trans_handle *trans,
@@ -2321,7 +2329,7 @@ again:
*/
btrfs_unlock_up_safe(p, level + 1);
- ret = btrfs_bin_search(b, key, &slot);
+ ret = btrfs_bin_search(b, 0, key, &slot);
if (ret < 0)
goto done;
@@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path)
{
- while (1) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
int ret;
- const int slot = path->slots[0];
- const struct extent_buffer *leaf = path->nodes[0];
- /* This is where we start walking the path. */
- if (slot >= btrfs_header_nritems(leaf)) {
- /*
- * If we've reached the last slot in this leaf we need
- * to go to the next leaf and reset the path.
- */
- ret = btrfs_next_leaf(root, path);
- if (ret)
- return ret;
- continue;
- }
- /* Store the found, valid item in @key. */
- btrfs_item_key_to_cpu(leaf, key, slot);
- break;
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
}
+
+ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
return 0;
}
@@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
- /*
- * slot + 1 is not valid or we fail to read the right node,
- * no big deal, just return.
- */
if (IS_ERR(right))
- return 1;
+ return PTR_ERR(right);
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
@@ -3417,12 +3410,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
- /*
- * slot - 1 is not valid or we fail to read the left node,
- * no big deal, just return.
- */
if (IS_ERR(left))
- return 1;
+ return PTR_ERR(left);
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
@@ -4576,7 +4565,7 @@ again:
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
- sret = btrfs_bin_search(cur, min_key, &slot);
+ sret = btrfs_bin_search(cur, 0, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 97897107fab5..4c1986cd5bed 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot,
- const struct btrfs_key *key, int *slot);
+int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+ const struct btrfs_key *key, int *slot);
-/*
- * Simple binary search on an extent buffer. Works for both leaves and nodes, and
- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
- */
-static inline int btrfs_bin_search(struct extent_buffer *eb,
- const struct btrfs_key *key,
- int *slot)
-{
- return btrfs_generic_bin_search(eb, 0, key, slot);
-}
-
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
- int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 7ddb1d104e8e..427abaf608b8 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* racing with an ordered completion or some such that would think it
* needs to free the reservation we just made.
*/
- spin_lock(&inode->lock);
nr_extents = count_max_extents(fs_info, num_bytes);
+ spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 886ffb232eac..0b32432d7d56 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
return ret;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
-{
- u64 num_entries =
- atomic_read(&trans->transaction->delayed_refs.num_entries);
- u64 avg_runtime;
- u64 val;
-
- smp_mb();
- avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
- val = num_entries * avg_runtime;
- if (val >= NSEC_PER_SEC)
- return 1;
- if (val >= NSEC_PER_SEC / 2)
- return 2;
-
- return btrfs_check_space_for_delayed_refs(trans->fs_info);
-}
-
/*
* Release a ref head's reservation.
*
@@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
+ const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
u64 released = 0;
- /*
- * We have to check the mount option here because we could be enabling
- * the free space tree for the first time and don't have the compat_ro
- * option set yet.
- *
- * We need extra reservations if we have the free space tree because
- * we'll have to modify that tree as well.
- */
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
- num_bytes *= 2;
-
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
@@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
if (!trans->delayed_ref_updates)
return;
- num_bytes = btrfs_calc_insert_metadata_size(fs_info,
- trans->delayed_ref_updates);
- /*
- * We have to check the mount option here because we could be enabling
- * the free space tree for the first time and don't have the compat_ro
- * option set yet.
- *
- * We need extra reservations if we have the free space tree because
- * we'll have to modify that tree as well.
- */
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
- num_bytes *= 2;
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
+ trans->delayed_ref_updates);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
@@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
if (ret)
return ret;
- btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, num_bytes, 1);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 2eb34abf700f..b54261fe509b 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
void __cold btrfs_delayed_ref_exit(void);
+static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
+ int num_delayed_refs)
+{
+ u64 num_bytes;
+
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs);
+
+ /*
+ * We have to check the mount option here because we could be enabling
+ * the free space tree for the first time and don't have the compat_ro
+ * option set yet.
+ *
+ * We need extra reservations if we have the free space tree because
+ * we'll have to modify that tree as well.
+ */
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ num_bytes *= 2;
+
+ return num_bytes;
+}
+
static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
int action, u64 bytenr, u64 len, u64 parent)
{
@@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *src,
u64 num_bytes);
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e1596bb208d..59ea049fe7ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
- unsigned int nofs_flag;
- /*
- * We might be called under a transaction (e.g. indirect backref
- * resolution) which could deadlock if it triggers memory reclaim
- */
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_drew_lock_init(&root->snapshot_lock);
- memalloc_nofs_restore(nofs_flag);
- if (ret)
- goto fail;
+ btrfs_drew_lock_init(&root->snapshot_lock);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root)) {
@@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root)
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
- btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
@@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->reloc_cancel_req, 0);
}
-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
+static int btrfs_init_btree_inode(struct super_block *sb)
{
- struct inode *inode = fs_info->btree_inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
fs_info->tree_root);
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return -ENOMEM;
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
*/
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &btree_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
@@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
+ fs_info->btree_inode = inode;
+
+ return 0;
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
atomic64_set(&fs_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
- fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -3344,14 +3342,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
int ret;
- int err = -EINVAL;
int level;
ret = init_mount_fs_info(fs_info, sb);
- if (ret) {
- err = ret;
+ if (ret)
goto fail;
- }
/* These need to be init'ed before we start creating inodes and such. */
tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
@@ -3361,17 +3356,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
GFP_KERNEL);
fs_info->chunk_root = chunk_root;
if (!tree_root || !chunk_root) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail;
}
- fs_info->btree_inode = new_inode(sb);
- if (!fs_info->btree_inode) {
- err = -ENOMEM;
+ ret = btrfs_init_btree_inode(sb);
+ if (ret)
goto fail;
- }
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
- btrfs_init_btree_inode(fs_info);
invalidate_bdev(fs_devices->latest_dev->bdev);
@@ -3380,7 +3371,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
- err = PTR_ERR(disk_super);
+ ret = PTR_ERR(disk_super);
goto fail_alloc;
}
@@ -3392,7 +3383,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
- err = -EINVAL;
+ ret = -EINVAL;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -3401,7 +3392,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
- err = ret;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -3412,7 +3402,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
if (btrfs_check_super_csum(fs_info, disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
- err = -EINVAL;
+ ret = -EINVAL;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -3442,12 +3432,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
ret = btrfs_validate_mount_super(fs_info);
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
- err = -EINVAL;
+ ret = -EINVAL;
goto fail_alloc;
}
- if (!btrfs_super_root(disk_super))
+ if (!btrfs_super_root(disk_super)) {
+ btrfs_err(fs_info, "invalid superblock tree root bytenr");
+ ret = -EINVAL;
goto fail_alloc;
+ }
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -3474,16 +3467,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->stripesize = stripesize;
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret) {
- err = ret;
+ if (ret)
goto fail_alloc;
- }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto fail_alloc;
- }
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
@@ -3503,17 +3492,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
- if (!subpage_info)
+ if (!subpage_info) {
+ ret = -ENOMEM;
goto fail_alloc;
+ }
btrfs_init_subpage_info(subpage_info, sectorsize);
fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info);
- if (ret) {
- err = ret;
+ if (ret)
goto fail_sb_buffer;
- }
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
@@ -3559,6 +3548,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
btrfs_free_extra_devids(fs_devices);
if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
+ ret = -EIO;
goto fail_tree_roots;
}
@@ -3574,8 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
ret = btrfs_get_dev_zone_info_all_devices(fs_info);
if (ret) {
btrfs_err(fs_info,
- "zoned: failed to read device zone info: %d",
- ret);
+ "zoned: failed to read device zone info: %d", ret);
goto fail_block_groups;
}
@@ -3654,19 +3643,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
!btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
+ ret = -EINVAL;
goto fail_sysfs;
}
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
"btrfs-cleaner");
- if (IS_ERR(fs_info->cleaner_kthread))
+ if (IS_ERR(fs_info->cleaner_kthread)) {
+ ret = PTR_ERR(fs_info->cleaner_kthread);
goto fail_sysfs;
+ }
fs_info->transaction_kthread = kthread_run(transaction_kthread,
tree_root,
"btrfs-transaction");
- if (IS_ERR(fs_info->transaction_kthread))
+ if (IS_ERR(fs_info->transaction_kthread)) {
+ ret = PTR_ERR(fs_info->transaction_kthread);
goto fail_cleaner;
+ }
if (!btrfs_test_opt(fs_info, NOSSD) &&
!fs_info->fs_devices->rotating) {
@@ -3684,7 +3678,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->fs_devices->discardable) {
btrfs_set_and_info(fs_info, DISCARD_ASYNC,
"auto enabling async discard");
- btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
}
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -3711,16 +3704,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
- if (ret) {
- err = ret;
+ if (ret)
goto fail_qgroup;
- }
}
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
- err = PTR_ERR(fs_info->fs_root);
- btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+ ret = PTR_ERR(fs_info->fs_root);
+ btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
fs_info->fs_root = NULL;
goto fail_qgroup;
}
@@ -3797,7 +3788,8 @@ fail_alloc:
iput(fs_info->btree_inode);
fail:
btrfs_close_devices(fs_info->fs_devices);
- return err;
+ ASSERT(ret < 0);
+ return ret;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
@@ -4094,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device)
{
struct bio *bio = &device->flush_bio;
+ device->last_flush_error = BLK_STS_OK;
+
#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
* When a disk has write caching disabled, we skip submission of a bio
@@ -4122,25 +4116,24 @@ static void write_dev_flush(struct btrfs_device *device)
/*
* If the flush bio has been submitted by write_dev_flush, wait for it.
+ * Return true for any error, and false otherwise.
*/
-static blk_status_t wait_dev_flush(struct btrfs_device *device)
+static bool wait_dev_flush(struct btrfs_device *device)
{
struct bio *bio = &device->flush_bio;
- if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
- return BLK_STS_OK;
+ if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
+ return false;
- clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
wait_for_completion_io(&device->flush_wait);
- return bio->bi_status;
-}
+ if (bio->bi_status) {
+ device->last_flush_error = bio->bi_status;
+ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
+ return true;
+ }
-static int check_barrier_error(struct btrfs_fs_info *fs_info)
-{
- if (!btrfs_check_rw_degradable(fs_info, NULL))
- return -EIO;
- return 0;
+ return false;
}
/*
@@ -4152,7 +4145,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
struct list_head *head;
struct btrfs_device *dev;
int errors_wait = 0;
- blk_status_t ret;
lockdep_assert_held(&info->fs_devices->device_list_mutex);
/* send down all the barriers */
@@ -4167,7 +4159,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
continue;
write_dev_flush(dev);
- dev->last_flush_error = BLK_STS_OK;
}
/* wait for all the barriers */
@@ -4182,23 +4173,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
- ret = wait_dev_flush(dev);
- if (ret) {
- dev->last_flush_error = ret;
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
+ if (wait_dev_flush(dev))
errors_wait++;
- }
}
- if (errors_wait) {
- /*
- * At some point we need the status of all disks
- * to arrive at the volume status. So error checking
- * is being pushed to a separate loop.
- */
- return check_barrier_error(info);
- }
+ /*
+ * Checks last_flush_error of disks in order to determine the device
+ * state.
+ */
+ if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
+ return -EIO;
+
return 0;
}
@@ -4404,12 +4389,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
- break;
+ goto out;
btrfs_put_root(gang[i]);
}
root_objectid++;
}
-
+out:
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 824c657f59e8..5cd289de4e92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref,
- unsigned long *run_refs)
+ struct btrfs_delayed_ref_head *locked_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
return -EAGAIN;
}
- (*run_refs)++;
ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
@@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
- ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
- unsigned long actual_count = 0;
delayed_refs = &trans->transaction->delayed_refs;
do {
@@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
- &actual_count);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
cond_resched();
} while ((nr != -1 && count < nr) || locked_ref);
- /*
- * We don't want to include ref heads since we can have empty ref heads
- * and those will drastically skew our runtime down since we just do
- * accounting, no actual extent tree updates.
- */
- if (actual_count > 0) {
- u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
- u64 avg;
-
- /*
- * We weigh the current average higher than our current runtime
- * to avoid large swings in the average.
- */
- spin_lock(&delayed_refs->lock);
- avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
- fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
- spin_unlock(&delayed_refs->lock);
- }
return 0;
}
@@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
{
int level = wc->level;
int lookup_info = 1;
- int ret;
+ int ret = 0;
while (level >= 0) {
ret = walk_down_proc(trans, root, path, wc, lookup_info);
- if (ret > 0)
+ if (ret)
break;
if (level == 0)
@@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
path->slots[level]++;
continue;
} else if (ret < 0)
- return ret;
+ break;
level = wc->level;
}
- return 0;
+ return (ret == 1) ? 0 : ret;
}
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
@@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
err = ret;
break;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40300e8e5f99..a1adadd5d25d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
* how many bytes are there before stripe/ordered extent boundary.
*/
struct btrfs_bio_ctrl {
- struct bio *bio;
+ struct btrfs_bio *bbio;
int mirror_num;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
+ blk_opf_t opf;
btrfs_bio_end_io_t end_io_func;
+ struct writeback_control *wbc;
/*
* This is for metadata read, to provide the extra needed verification
@@ -117,51 +119,41 @@ struct btrfs_bio_ctrl {
* does the unlocking.
*/
bool extent_locked;
-
- /* Tell the submit_bio code to use REQ_SYNC */
- bool sync_io;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- struct bio *bio;
- struct bio_vec *bv;
- struct inode *inode;
- int mirror_num;
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
+ int mirror_num = bio_ctrl->mirror_num;
- if (!bio_ctrl->bio)
+ if (!bbio)
return;
- bio = bio_ctrl->bio;
- bv = bio_first_bvec_all(bio);
- inode = bv->bv_page->mapping->host;
- mirror_num = bio_ctrl->mirror_num;
-
/* Caller should ensure the bio has at least some range added */
- ASSERT(bio->bi_iter.bi_size);
+ ASSERT(bbio->bio.bi_iter.bi_size);
- if (!is_data_inode(inode)) {
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ if (!is_data_inode(&bbio->inode->vfs_inode)) {
+ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) {
/*
* For metadata read, we should have the parent_check,
* and copy it to bbio for metadata verification.
*/
ASSERT(bio_ctrl->parent_check);
- memcpy(&btrfs_bio(bio)->parent_check,
+ memcpy(&bbio->parent_check,
bio_ctrl->parent_check,
sizeof(struct btrfs_tree_parent_check));
}
- bio->bi_opf |= REQ_META;
+ bbio->bio.bi_opf |= REQ_META;
}
- if (btrfs_op(bio) == BTRFS_MAP_READ &&
+ if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
- btrfs_submit_compressed_read(inode, bio, mirror_num);
+ btrfs_submit_compressed_read(bbio, mirror_num);
else
- btrfs_submit_bio(bio, mirror_num);
+ btrfs_submit_bio(bbio, mirror_num);
- /* The bio is owned by the end_io handler now */
- bio_ctrl->bio = NULL;
+ /* The bbio is owned by the end_io handler now */
+ bio_ctrl->bbio = NULL;
}
/*
@@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
*/
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
- struct bio *bio = bio_ctrl->bio;
+ struct btrfs_bio *bbio = bio_ctrl->bbio;
- if (!bio)
+ if (!bbio)
return;
if (ret) {
ASSERT(ret < 0);
- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
/* The bio is owned by the end_io handler now */
- bio_ctrl->bio = NULL;
+ bio_ctrl->bbio = NULL;
} else {
submit_one_bio(bio_ctrl);
}
@@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
-/*
- * Attempt to add a page to bio.
- *
- * @bio_ctrl: record both the bio, and its bio_flags
- * @page: page to add to the bio
- * @disk_bytenr: offset of the new bio or to check whether we are adding
- * a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @pg_offset: starting offset in the page
- * @compress_type: compression type of the current bio to see if we can merge them
- *
- * Attempt to add a page to bio considering stripe alignment etc.
- *
- * Return >= 0 for the number of bytes added to the bio.
- * Can return 0 if the current bio is already at stripe/zone boundary.
- * Return <0 for error.
- */
-static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page,
- u64 disk_bytenr, unsigned int size,
- unsigned int pg_offset,
- enum btrfs_compression_type compress_type)
+static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page, u64 disk_bytenr,
+ unsigned int pg_offset)
{
- struct bio *bio = bio_ctrl->bio;
- u32 bio_size = bio->bi_iter.bi_size;
- u32 real_size;
+ struct bio *bio = &bio_ctrl->bbio->bio;
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- bool contig = false;
- ASSERT(bio);
- /* The limit should be calculated when bio_ctrl->bio is allocated */
- ASSERT(bio_ctrl->len_to_oe_boundary);
- if (bio_ctrl->compress_type != compress_type)
- return 0;
-
-
- if (bio->bi_iter.bi_size == 0) {
- /* We can always add a page into an empty bio. */
- contig = true;
- } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
- struct bio_vec *bvec = bio_last_bvec_all(bio);
-
- /*
- * The contig check requires the following conditions to be met:
- * 1) The pages are belonging to the same inode
- * This is implied by the call chain.
- *
- * 2) The range has adjacent logical bytenr
- *
- * 3) The range has adjacent file offset
- * This is required for the usage of btrfs_bio->file_offset.
- */
- if (bio_end_sector(bio) == sector &&
- page_offset(bvec->bv_page) + bvec->bv_offset +
- bvec->bv_len == page_offset(page) + pg_offset)
- contig = true;
- } else {
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
- * For compression, all IO should have its logical bytenr
- * set to the starting bytenr of the compressed extent.
+ * For compression, all IO should have its logical bytenr set
+ * to the starting bytenr of the compressed extent.
*/
- contig = bio->bi_iter.bi_sector == sector;
+ return bio->bi_iter.bi_sector == sector;
}
- if (!contig)
- return 0;
-
- real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size);
-
/*
- * If real_size is 0, never call bio_add_*_page(), as even size is 0,
- * bio will still execute its endio function on the page!
+ * The contig check requires the following conditions to be met:
+ *
+ * 1) The pages are belonging to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has adjacent logical bytenr
+ *
+ * 3) The range has adjacent file offset
+ * This is required for the usage of btrfs_bio->file_offset.
*/
- if (real_size == 0)
- return 0;
-
- return bio_add_page(bio, page, real_size, pg_offset);
+ return bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
+ page_offset(page) + pg_offset;
}
-static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
- struct btrfs_inode *inode, u64 file_offset)
+static void alloc_new_bio(struct btrfs_inode *inode,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, u64 file_offset)
{
- struct btrfs_ordered_extent *ordered;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_bio *bbio;
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
+ bio_ctrl->end_io_func, NULL);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;
+ bbio->file_offset = file_offset;
+ bio_ctrl->bbio = bbio;
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
/*
* Limit the extent to the ordered boundary for Zone Append.
@@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
* them.
*/
if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE &&
- btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) {
+ btrfs_use_zone_append(bbio)) {
+ struct btrfs_ordered_extent *ordered;
+
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (ordered) {
bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
ordered->file_offset +
ordered->disk_num_bytes - file_offset);
btrfs_put_ordered_extent(ordered);
- return;
}
}
- bio_ctrl->len_to_oe_boundary = U32_MAX;
-}
-
-static void alloc_new_bio(struct btrfs_inode *inode,
- struct btrfs_bio_ctrl *bio_ctrl,
- struct writeback_control *wbc, blk_opf_t opf,
- u64 disk_bytenr, u32 offset, u64 file_offset,
- enum btrfs_compression_type compress_type)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct bio *bio;
-
- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func,
- NULL);
- /*
- * For compressed page range, its disk_bytenr is always @disk_bytenr
- * passed in, no matter if we have added any range into previous bio.
- */
- if (compress_type != BTRFS_COMPRESS_NONE)
- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- else
- bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
- btrfs_bio(bio)->file_offset = file_offset;
- bio_ctrl->bio = bio;
- bio_ctrl->compress_type = compress_type;
- calc_bio_boundaries(bio_ctrl, inode, file_offset);
-
- if (wbc) {
+ if (bio_ctrl->wbc) {
/*
* Pick the last added device to support cgroup writeback. For
* multi-device file systems this means blk-cgroup policies have
* to always be set on the last added/replaced device.
* This is a bit odd but has been like that for a long time.
*/
- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
- wbc_init_bio(wbc, bio);
+ bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
+ wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
}
}
/*
- * @opf: bio REQ_OP_* and REQ_* flags as one value
- * @wbc: optional writeback control for io accounting
* @disk_bytenr: logical bytenr where the write will be
* @page: page to add to the bio
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @compress_type: compress type for current bio
*
- * The will either add the page into the existing @bio_ctrl->bio, or allocate a
- * new one in @bio_ctrl->bio.
+ * The will either add the page into the existing @bio_ctrl->bbio, or allocate a
+ * new one in @bio_ctrl->bbio.
* The mirror number for this IO should already be initizlied in
* @bio_ctrl->mirror_num.
*/
-static int submit_extent_page(blk_opf_t opf,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, struct page *page,
- size_t size, unsigned long pg_offset,
- enum btrfs_compression_type compress_type,
- bool force_bio_submit)
+static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct page *page,
+ size_t size, unsigned long pg_offset)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- unsigned int cur = pg_offset;
-
- ASSERT(bio_ctrl);
-
- ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
- pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= PAGE_SIZE);
ASSERT(bio_ctrl->end_io_func);
- if (force_bio_submit)
+ if (bio_ctrl->bbio &&
+ !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
submit_one_bio(bio_ctrl);
- while (cur < pg_offset + size) {
- u32 offset = cur - pg_offset;
- int added;
+ do {
+ u32 len = size;
/* Allocate new bio if needed */
- if (!bio_ctrl->bio) {
- alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr,
- offset, page_offset(page) + cur,
- compress_type);
+ if (!bio_ctrl->bbio) {
+ alloc_new_bio(inode, bio_ctrl, disk_bytenr,
+ page_offset(page) + pg_offset);
}
- /*
- * We must go through btrfs_bio_add_page() to ensure each
- * page range won't cross various boundaries.
- */
- if (compress_type != BTRFS_COMPRESS_NONE)
- added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
- size - offset, pg_offset + offset,
- compress_type);
- else
- added = btrfs_bio_add_page(bio_ctrl, page,
- disk_bytenr + offset, size - offset,
- pg_offset + offset, compress_type);
-
- /* Metadata page range should never be split */
- if (!is_data_inode(&inode->vfs_inode))
- ASSERT(added == 0 || added == size - offset);
-
- /* At least we added some page, update the account */
- if (wbc && added)
- wbc_account_cgroup_owner(wbc, page, added);
-
- /* We have reached boundary, submit right now */
- if (added < size - offset) {
- /* The bio should contain some page(s) */
- ASSERT(bio_ctrl->bio->bi_iter.bi_size);
+
+ /* Cap to the current ordered extent boundary if there is one. */
+ if (len > bio_ctrl->len_to_oe_boundary) {
+ ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
+ ASSERT(is_data_inode(&inode->vfs_inode));
+ len = bio_ctrl->len_to_oe_boundary;
+ }
+
+ if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
+ /* bio full: move on to a new one */
submit_one_bio(bio_ctrl);
+ continue;
}
- cur += added;
- }
- return 0;
+
+ if (bio_ctrl->wbc)
+ wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);
+
+ size -= len;
+ pg_offset += len;
+ disk_bytenr += len;
+ bio_ctrl->len_to_oe_boundary -= len;
+
+ /* Ordered extent boundary: move on to a new bio. */
+ if (bio_ctrl->len_to_oe_boundary == 0)
+ submit_one_bio(bio_ctrl);
+ } while (size);
}
static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* return 0 on success, otherwise return error
*/
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- blk_opf_t read_flags, u64 *prev_em_start)
+ struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unlock_extent(tree, start, end, NULL);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
- goto out;
+ return ret;
}
if (page->index == last_byte >> PAGE_SHIFT) {
@@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
bio_ctrl->end_io_func = end_bio_extent_readpage;
begin_page_read(fs_info, page);
while (cur <= end) {
- unsigned long this_bio_flag = 0;
+ enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
bool force_bio_submit = false;
u64 disk_bytenr;
@@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (IS_ERR(em)) {
unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
- ret = PTR_ERR(em);
- break;
+ return PTR_ERR(em);
}
extent_offset = cur - em->start;
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- this_bio_flag = em->compress_type;
+ compress_type = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag != BTRFS_COMPRESS_NONE)
+ if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
@@ -1331,24 +1241,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- bio_ctrl, disk_bytenr, page, iosize,
- pg_offset, this_bio_flag,
- force_bio_submit);
- if (ret) {
- /*
- * We have to unlock the remaining range, or the page
- * will never be unlocked.
- */
- unlock_extent(tree, cur, end, NULL);
- end_page_read(page, false, cur, end + 1 - cur);
- goto out;
+ if (bio_ctrl->compress_type != compress_type) {
+ submit_one_bio(bio_ctrl);
+ bio_ctrl->compress_type = compress_type;
}
+
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
+ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset);
cur = cur + iosize;
pg_offset += iosize;
}
-out:
- return ret;
+
+ return 0;
}
int btrfs_read_folio(struct file *file, struct folio *folio)
@@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
* bio to do the cleanup.
@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
for (index = 0; index < nr_pages; index++) {
btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
- REQ_RAHEAD, prev_em_start);
+ prev_em_start);
put_page(pages[index]);
}
}
@@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
*/
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
- struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
@@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u64 extent_offset;
u64 block_start;
struct extent_map *em;
- int saved_ret = 0;
int ret = 0;
int nr = 0;
- enum req_op op = REQ_OP_WRITE;
- const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- bool has_error = false;
bool compressed;
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
- redirty_page_for_writepage(wbc, page);
+ redirty_page_for_writepage(bio_ctrl->wbc, page);
unlock_page(page);
return 1;
}
@@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- wbc->nr_to_write--;
+ bio_ctrl->wbc->nr_to_write--;
bio_ctrl->end_io_func = end_bio_extent_writepage;
while (cur <= end) {
@@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
- has_error = true;
- if (!saved_ret)
- saved_ret = ret;
- break;
+ goto out_error;
}
extent_offset = cur - em->start;
@@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
- ret = submit_extent_page(op | write_flags, wbc,
- bio_ctrl, disk_bytenr,
- page, iosize,
- cur - page_offset(page),
- 0, false);
- if (ret) {
- has_error = true;
- if (!saved_ret)
- saved_ret = ret;
-
- btrfs_page_set_error(fs_info, page, cur, iosize);
- if (PageWriteback(page))
- btrfs_page_clear_writeback(fs_info, page, cur,
- iosize);
- }
-
+ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
+ cur - page_offset(page));
cur += iosize;
nr++;
}
+
+ btrfs_page_assert_not_dirty(fs_info, page);
+ *nr_ret = nr;
+ return 0;
+
+out_error:
/*
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
- if (!has_error)
- btrfs_page_assert_not_dirty(fs_info, page);
- else
- ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* Return 0 if everything goes well.
* Return <0 for error.
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{
struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
@@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- trace___extent_writepage(page, inode, wbc);
+ trace___extent_writepage(page, inode, bio_ctrl->wbc);
WARN_ON(!PageLocked(page));
@@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (!bio_ctrl->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
+ ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
if (ret == 1)
return 0;
if (ret)
goto done;
}
- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size,
- &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
if (ret == 1)
return 0;
@@ -1773,6 +1657,8 @@ done:
if (PageError(page))
end_extent_writepage(page, ret, page_start, page_end);
if (bio_ctrl->extent_locked) {
+ struct writeback_control *wbc = bio_ctrl->wbc;
+
/*
* If bio_ctrl->extent_locked, it's from extent_write_locked_range(),
* the page can either be locked by lock_page() or
@@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
btrfs_tree_unlock(eb);
- if (!bio_ctrl->sync_io)
+ if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL)
return 0;
if (!flush) {
submit_write_bio(bio_ctrl, 0);
@@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb)
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
*/
-static int write_one_subpage_eb(struct extent_buffer *eb,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static void write_one_subpage_eb(struct extent_buffer *eb,
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- blk_opf_t write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
- int ret;
prepare_eb_write(eb);
@@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
bio_ctrl->end_io_func = end_bio_subpage_eb_writepage;
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- bio_ctrl, eb->start, page, eb->len,
- eb->start - page_offset(page), 0, false);
- if (ret) {
- btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
- set_btree_ioerr(page, eb);
- unlock_page(page);
-
- if (atomic_dec_and_test(&eb->io_pages))
- end_extent_buffer_writeback(eb);
- return -EIO;
- }
+ submit_extent_page(bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page));
unlock_page(page);
/*
* Submission finished without problem, if no range of the page is
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
- wbc->nr_to_write--;
- return ret;
+ bio_ctrl->wbc->nr_to_write--;
}
-static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
- struct writeback_control *wbc,
+static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
struct btrfs_bio_ctrl *bio_ctrl)
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- blk_opf_t write_flags = wbc_to_write_flags(wbc);
- int ret = 0;
prepare_eb_write(eb);
@@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- bio_ctrl, disk_bytenr, p,
- PAGE_SIZE, 0, 0, false);
- if (ret) {
- set_btree_ioerr(p, eb);
- if (PageWriteback(p))
- end_page_writeback(p);
- if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
- end_extent_buffer_writeback(eb);
- ret = -EIO;
- break;
- }
+ submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0);
disk_bytenr += PAGE_SIZE;
- wbc->nr_to_write--;
+ bio_ctrl->wbc->nr_to_write--;
unlock_page(p);
}
-
- if (unlikely(ret)) {
- for (; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- clear_page_dirty_for_io(p);
- unlock_page(p);
- }
- }
-
- return ret;
}
/*
@@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
-static int submit_eb_subpage(struct page *page,
- struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl)
+static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
@@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_subpage_eb(eb, wbc, bio_ctrl);
+ write_one_subpage_eb(eb, bio_ctrl);
free_extent_buffer(eb);
- if (ret < 0)
- goto cleanup;
submitted++;
}
return submitted;
@@ -2318,8 +2162,7 @@ cleanup:
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct writeback_control *wbc,
- struct btrfs_bio_ctrl *bio_ctrl,
+static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
@@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc, bio_ctrl);
+ return submit_eb_subpage(page, bio_ctrl);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
@@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
* If for_sync, this hole will be filled with
* trasnsaction commit.
*/
- if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL &&
+ !bio_ctrl->wbc->for_sync)
ret = -EAGAIN;
else
ret = 0;
@@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
- ret = write_one_eb(eb, wbc, bio_ctrl);
+ write_one_eb(eb, bio_ctrl);
free_extent_buffer(eb);
- if (ret < 0)
- return ret;
return 1;
}
@@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping,
{
struct extent_buffer *eb_context = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = wbc,
+ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
.extent_locked = 0,
- .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
@@ -2445,8 +2288,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- ret = submit_eb_page(&folio->page, wbc, &bio_ctrl,
- &eb_context);
+ ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
@@ -2529,9 +2371,9 @@ retry:
* existing IO to complete.
*/
static int extent_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl)
{
+ struct writeback_control *wbc = bio_ctrl->wbc;
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
@@ -2632,7 +2474,7 @@ retry:
continue;
}
- ret = __extent_writepage(&folio->page, wbc, bio_ctrl);
+ ret = __extent_writepage(&folio->page, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
u64 cur = start;
unsigned long nr_pages;
const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
- struct btrfs_bio_ctrl bio_ctrl = {
- .extent_locked = 1,
- .sync_io = 1,
- };
struct writeback_control wbc_writepages = {
.sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
- /* We're called from an async helper function */
- .punt_to_cgroup = 1,
.no_cgroup_owner = 1,
};
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = &wbc_writepages,
+ /* We're called from an async helper function */
+ .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT |
+ wbc_to_write_flags(&wbc_writepages),
+ .extent_locked = 1,
+ };
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
@@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
ASSERT(PageLocked(page));
ASSERT(PageDirty(page));
clear_page_dirty_for_io(page);
- ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
+ ret = __extent_writepage(page, &bio_ctrl);
ASSERT(ret <= 0);
if (ret < 0) {
found_error = true;
@@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int ret = 0;
struct btrfs_bio_ctrl bio_ctrl = {
+ .wbc = wbc,
+ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
.extent_locked = 0,
- .sync_io = (wbc->sync_mode == WB_SYNC_ALL),
};
/*
@@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping,
* protect the write pointer updates.
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
- ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+ ret = extent_write_cache_pages(mapping, &bio_ctrl);
submit_write_bio(&bio_ctrl, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
@@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping,
void extent_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
@@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct page *page = eb->pages[0];
struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
.mirror_num = mirror_num,
.parent_check = check,
};
- int ret = 0;
+ int ret;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
ASSERT(PagePrivate(page));
@@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
return ret;
}
- ret = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
- return ret;
+ return 0;
}
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
@@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
- eb->start, page, eb->len,
- eb->start - page_offset(page), 0, true);
- if (ret) {
- /*
- * In the endio function, if we hit something wrong we will
- * increase the io_pages, so here we need to decrease it for
- * error path.
- */
- atomic_dec(&eb->io_pages);
- }
+ submit_extent_page(&bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page));
submit_one_bio(&bio_ctrl);
- if (ret || wait != WAIT_COMPLETE) {
+ if (wait != WAIT_COMPLETE) {
free_extent_state(cached_state);
- return ret;
+ return 0;
}
wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
EXTENT_LOCKED, &cached_state);
if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- ret = -EIO;
- return ret;
+ return -EIO;
+ return 0;
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
@@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
{
int i;
struct page *page;
- int err;
- int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ,
.mirror_num = mirror_num,
.parent_check = check,
};
@@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
page = eb->pages[i];
if (!PageUptodate(page)) {
- if (ret) {
- atomic_dec(&eb->io_pages);
- unlock_page(page);
- continue;
- }
-
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ, NULL,
- &bio_ctrl, page_offset(page), page,
- PAGE_SIZE, 0, 0, false);
- if (err) {
- /*
- * We failed to submit the bio so it's the
- * caller's responsibility to perform cleanup
- * i.e unlock page/set error bit.
- */
- ret = err;
- SetPageError(page);
- unlock_page(page);
- atomic_dec(&eb->io_pages);
- }
+ submit_extent_page(&bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0);
} else {
unlock_page(page);
}
@@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
submit_one_bio(&bio_ctrl);
- if (ret || wait != WAIT_COMPLETE)
- return ret;
+ if (wait != WAIT_COMPLETE)
+ return 0;
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
- ret = -EIO;
+ return -EIO;
}
- return ret;
+ return 0;
unlock_exit:
while (locked_pages > 0) {
@@ -4596,7 +4412,7 @@ unlock_exit:
page = eb->pages[locked_pages];
unlock_page(page);
}
- return ret;
+ return 0;
}
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 41c77a100853..018c711a0bc8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -336,48 +336,6 @@ out:
}
/*
- * Locate the file_offset of @cur_disk_bytenr of a @bio.
- *
- * Bio of btrfs represents read range of
- * [bi_sector << 9, bi_sector << 9 + bi_size).
- * Knowing this, we can iterate through each bvec to locate the page belong to
- * @cur_disk_bytenr and get the file offset.
- *
- * @inode is used to determine if the bvec page really belongs to @inode.
- *
- * Return 0 if we can't find the file offset
- * Return >0 if we find the file offset and restore it to @file_offset_ret
- */
-static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
- u64 disk_bytenr, u64 *file_offset_ret)
-{
- struct bvec_iter iter;
- struct bio_vec bvec;
- u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- int ret = 0;
-
- bio_for_each_segment(bvec, bio, iter) {
- struct page *page = bvec.bv_page;
-
- if (cur > disk_bytenr)
- break;
- if (cur + bvec.bv_len <= disk_bytenr) {
- cur += bvec.bv_len;
- continue;
- }
- ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
- if (page->mapping && page->mapping->host &&
- page->mapping->host == inode) {
- ret = 1;
- *file_offset_ret = page_offset(page) + bvec.bv_offset +
- disk_bytenr - cur;
- break;
- }
- }
- return ret;
-}
-
-/*
* Lookup the checksum for the read bio in csum tree.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
@@ -386,17 +344,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_io_tree *io_tree = &inode->io_tree;
struct bio *bio = &bbio->bio;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- u64 cur_disk_bytenr;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
- int count = 0;
blk_status_t ret = BLK_STS_OK;
+ u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
@@ -447,28 +403,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
path->skip_locking = 1;
}
- for (cur_disk_bytenr = orig_disk_bytenr;
- cur_disk_bytenr < orig_disk_bytenr + orig_len;
- cur_disk_bytenr += (count * sectorsize)) {
- u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
- unsigned int sector_offset;
- u8 *csum_dst;
-
- /*
- * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
- * we're calculating the offset to the bio start.
- *
- * Bio size is limited to UINT_MAX, thus unsigned int is large
- * enough to contain the raw result, not to mention the right
- * shifted result.
- */
- ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
- sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
- fs_info->sectorsize_bits;
- csum_dst = bbio->csum + sector_offset * csum_size;
+ while (bio_offset < orig_len) {
+ int count;
+ u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
+ u8 *csum_dst = bbio->csum +
+ (bio_offset >> fs_info->sectorsize_bits) * csum_size;
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
- search_len, csum_dst);
+ orig_len - bio_offset, csum_dst);
if (count < 0) {
ret = errno_to_blk_status(count);
if (bbio->csum != bbio->csum_inline)
@@ -493,14 +435,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
if (inode->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
- u64 file_offset;
- int ret;
-
- ret = search_file_offset_in_bio(bio,
- &inode->vfs_inode,
- cur_disk_bytenr, &file_offset);
- if (ret)
- set_extent_bits(io_tree, file_offset,
+ u64 file_offset = bbio->file_offset + bio_offset;
+
+ set_extent_bits(&inode->io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM);
} else {
@@ -509,6 +446,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
cur_disk_bytenr, cur_disk_bytenr + sectorsize);
}
}
+ bio_offset += count * sectorsize;
}
btrfs_free_path(path);
@@ -659,7 +597,8 @@ fail:
* in is large enough to contain all csums.
*/
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap)
+ u8 *csum_buf, unsigned long *csum_bitmap,
+ bool search_commit)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -676,6 +615,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ }
+
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = start;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index cd7f2ae515c0..6be8725cd574 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
- u8 *csum_buf, unsigned long *csum_bitmap);
+ u8 *csum_buf, unsigned long *csum_bitmap,
+ bool search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 24cd49229408..0d98fc5f6f44 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -25,6 +25,18 @@
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
+ * Number of metadata items necessary for an unlink operation:
+ *
+ * 1 for the possible orphan item
+ * 1 for the dir item
+ * 1 for the dir index
+ * 1 for the inode ref
+ * 1 for the inode
+ * 1 for the parent inode
+ */
+#define BTRFS_UNLINK_METADATA_UNITS 6
+
+/*
* The reserved space at the beginning of each device. It covers the primary
* super block and leaves space for potential use by other tools like
* bootloaders or to lower potential damage of accidental overwrite.
@@ -193,11 +205,7 @@ enum {
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
-#ifdef CONFIG_BTRFS_DEBUG
-/*
- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
- */
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
+#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
@@ -210,23 +218,22 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * Features under developmen like Extent tree v2 support is enabled
+ * only under CONFIG_BTRFS_DEBUG.
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+
#else
-#define BTRFS_FEATURE_INCOMPAT_SUPP \
- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
- BTRFS_FEATURE_INCOMPAT_RAID56 | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)
+
#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
@@ -412,7 +419,6 @@ struct btrfs_fs_info {
* Must be written and read while holding btrfs_fs_info::commit_root_sem.
*/
u64 last_reloc_trans;
- u64 avg_delayed_ref_runtime;
/*
* This is updated to the current trans every time a full commit is
@@ -638,7 +644,6 @@ struct btrfs_fs_info {
refcount_t scrub_workers_refcnt;
struct workqueue_struct *scrub_workers;
struct workqueue_struct *scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves(
* Use this if we would be adding new items, as we could split nodes as we cow
* down the tree.
*/
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
@@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
* Doing a truncate or a modification won't result in new nodes or leaves, just
* what we need for COW.
*/
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b65c45b5d681..4c322b720a80 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -527,7 +527,7 @@ search_again:
while (1) {
u64 clear_start = 0, clear_len = 0, extent_start = 0;
- bool should_throttle = false;
+ bool refill_delayed_refs_rsv = false;
fi = NULL;
leaf = path->nodes[0];
@@ -660,8 +660,7 @@ delete:
/* No pending yet, add ourselves */
pending_del_slot = path->slots[0];
pending_del_nr = 1;
- } else if (pending_del_nr &&
- path->slots[0] + 1 == pending_del_slot) {
+ } else if (path->slots[0] + 1 == pending_del_slot) {
/* Hop on the pending chunk */
pending_del_nr++;
pending_del_slot = path->slots[0];
@@ -686,10 +685,8 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (be_nice) {
- if (btrfs_should_throttle_delayed_refs(trans))
- should_throttle = true;
- }
+ if (be_nice && btrfs_check_space_for_delayed_refs(fs_info))
+ refill_delayed_refs_rsv = true;
}
if (found_type == BTRFS_INODE_ITEM_KEY)
@@ -697,7 +694,7 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot ||
- should_throttle) {
+ refill_delayed_refs_rsv) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -720,7 +717,7 @@ delete:
* actually allocate, so just bail if we're short and
* let the normal reservation dance happen higher up.
*/
- if (should_throttle) {
+ if (refill_delayed_refs_rsv) {
ret = btrfs_delayed_refs_rsv_refill(fs_info,
BTRFS_RESERVE_NO_FLUSH);
if (ret) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 957e4d76a7b6..57d070025c7a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,6 +79,7 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
+ struct btrfs_ordered_extent *ordered;
bool data_space_reserved;
bool nocow_done;
};
@@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- nr_pages = min_t(unsigned long, nr_pages,
- BTRFS_MAX_COMPRESSED / PAGE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
/*
* we don't want to send crud past the end of i_size through
@@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
ret = cow_file_range(inode, locked_page, start, end, &page_started,
&nr_written, 0, NULL);
/* Inline extent inserted, page gets unlocked and everything is done */
- if (page_started) {
- ret = 0;
- goto out;
- }
+ if (page_started)
+ return 0;
+
if (ret < 0) {
btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
if (locked_page) {
@@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
}
- goto out;
+ return ret;
}
- ret = extent_write_locked_range(&inode->vfs_inode, start, end);
/* All pages will be unlocked, including @locked_page */
-out:
- kfree(async_extent);
- return ret;
+ return extent_write_locked_range(&inode->vfs_inode, start, end);
}
static int submit_one_async_extent(struct btrfs_inode *inode,
@@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
+ if (async_chunk->blkcg_css)
+ kthread_associate_blkcg(async_chunk->blkcg_css);
+
/*
* If async_chunk->locked_page is in the async_extent range, we need to
* handle it.
@@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
lock_extent(io_tree, start, end, NULL);
/* We have fall back to uncompressed write */
- if (!async_extent->pages)
- return submit_uncompressed_range(inode, async_extent, locked_page);
+ if (!async_extent->pages) {
+ ret = submit_uncompressed_range(inode, async_extent, locked_page);
+ goto done;
+ }
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
@@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+
+ btrfs_submit_compressed_write(inode, start, /* file_offset */
async_extent->ram_size, /* num_bytes */
ins.objectid, /* disk_bytenr */
ins.offset, /* compressed_len */
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css, true)) {
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
-
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK | PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
+ async_chunk->write_flags, true);
*alloc_hint = ins.objectid + ins.offset;
+done:
+ if (async_chunk->blkcg_css)
+ kthread_associate_blkcg(NULL);
kfree(async_extent);
return ret;
@@ -1086,8 +1081,7 @@ out_free:
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
- kfree(async_extent);
- return ret;
+ goto done;
}
/*
@@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
if (blkcg_css != blkcg_root_css) {
css_get(blkcg_css);
async_chunk[i].blkcg_css = blkcg_css;
+ async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
} else {
async_chunk[i].blkcg_css = NULL;
}
@@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
/*
- * Split an extent_map at [start, start + len]
+ * Split off the first pre bytes from the extent_map at [start, start + len]
*
* This function is intended to be used only for extract_ordered_extent().
*/
-static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
- u64 pre, u64 post)
+static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct extent_map *split_pre = NULL;
struct extent_map *split_mid = NULL;
- struct extent_map *split_post = NULL;
int ret = 0;
unsigned long flags;
- /* Sanity check */
- if (pre == 0 && post == 0)
- return 0;
+ ASSERT(pre != 0);
+ ASSERT(pre < len);
split_pre = alloc_extent_map();
- if (pre)
- split_mid = alloc_extent_map();
- if (post)
- split_post = alloc_extent_map();
- if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ if (!split_pre)
+ return -ENOMEM;
+ split_mid = alloc_extent_map();
+ if (!split_mid) {
ret = -ENOMEM;
- goto out;
+ goto out_free_pre;
}
- ASSERT(pre + post < len);
-
lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
- split_pre->len = (pre ? pre : em->len - post);
+ split_pre->len = pre;
split_pre->orig_start = split_pre->start;
split_pre->block_start = em->block_start;
split_pre->block_len = split_pre->len;
@@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
/*
* Now we only have an extent_map at:
- * [em->start, em->start + pre] if pre != 0
- * [em->start, em->start + em->len - post] if pre == 0
- */
-
- if (pre) {
- /* Insert the middle extent_map */
- split_mid->start = em->start + pre;
- split_mid->len = em->len - pre - post;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
- split_mid->ram_bytes = split_mid->len;
- split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
- split_mid->generation = em->generation;
- add_extent_mapping(em_tree, split_mid, 1);
- }
-
- if (post) {
- split_post->start = em->start + em->len - post;
- split_post->len = post;
- split_post->orig_start = split_post->start;
- split_post->block_start = em->block_start + em->len - post;
- split_post->block_len = split_post->len;
- split_post->orig_block_len = split_post->block_len;
- split_post->ram_bytes = split_post->len;
- split_post->flags = flags;
- split_post->compress_type = em->compress_type;
- split_post->generation = em->generation;
- add_extent_mapping(em_tree, split_post, 1);
- }
+ * [em->start, em->start + pre]
+ */
+
+ /* Insert the middle extent_map. */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
/* Once for us */
free_extent_map(em);
@@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
out_unlock:
write_unlock(&em_tree->lock);
unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
-out:
- free_extent_map(split_pre);
free_extent_map(split_mid);
- free_extent_map(split_post);
-
+out_free_pre:
+ free_extent_map(split_pre);
return ret;
}
-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)
+int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
{
u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
u64 len = bbio->bio.bi_iter.bi_size;
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_ordered_extent *ordered;
- u64 file_len;
- u64 end = start + len;
- u64 ordered_end;
- u64 pre, post;
+ u64 ordered_len = ordered->num_bytes;
int ret = 0;
- ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);
- if (WARN_ON_ONCE(!ordered))
- return BLK_STS_IOERR;
+ /* Must always be called for the beginning of an ordered extent. */
+ if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+ return -EINVAL;
- /* No need to split */
+ /* No need to split if the ordered extent covers the entire bio. */
if (ordered->disk_num_bytes == len)
- goto out;
-
- /* We cannot split once end_bio'd ordered extent */
- if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
- ret = -EINVAL;
- goto out;
- }
-
- /* We cannot split a compressed ordered extent */
- if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
- ret = -EINVAL;
- goto out;
- }
-
- ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
- /* bio must be in one ordered extent */
- if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
- ret = -EINVAL;
- goto out;
- }
-
- /* Checksum list should be empty */
- if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
- ret = -EINVAL;
- goto out;
- }
-
- file_len = ordered->num_bytes;
- pre = start - ordered->disk_bytenr;
- post = ordered_end - end;
+ return 0;
- ret = btrfs_split_ordered_extent(ordered, pre, post);
+ ret = btrfs_split_ordered_extent(ordered, len);
if (ret)
- goto out;
- ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post);
+ return ret;
-out:
- btrfs_put_ordered_extent(ordered);
+ /*
+ * Don't split the extent_map for NOCOW extents, as we're writing into
+ * a pre-existing one.
+ */
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ return 0;
- return errno_to_blk_status(ret);
+ return split_extent_map(inode, bbio->file_offset, ordered_len, len);
}
/*
@@ -3367,13 +3308,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
return 0;
}
-static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
-{
- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
- return csums + offset_in_sectors * fs_info->csum_size;
-}
-
/*
* Verify the checksum of a single data sector.
*
@@ -3411,7 +3345,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
return true;
}
- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
+ csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
+ fs_info->csum_size;
if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
csum_expected))
goto zeroit;
@@ -3691,6 +3626,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ iput(inode);
goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
@@ -3698,8 +3634,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans);
- if (ret)
+ if (ret) {
+ iput(inode);
goto out;
+ }
continue;
}
@@ -4261,15 +4199,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
struct btrfs_root *root = dir->root;
- /*
- * 1 for the possible orphan item
- * 1 for the dir item
- * 1 for the dir index
- * 1 for the inode ref
- * 1 for the inode
- * 1 for the parent inode
- */
- return btrfs_start_transaction_fallback_global_rsv(root, 6);
+ return btrfs_start_transaction_fallback_global_rsv(root,
+ BTRFS_UNLINK_METADATA_UNITS);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -5243,7 +5174,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
int ret;
/*
@@ -5281,7 +5212,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
trans->block_rsv = &fs_info->trans_block_rsv;
trans->bytes_reserved = delayed_refs_extra;
btrfs_block_rsv_migrate(rsv, trans->block_rsv,
- delayed_refs_extra, 1);
+ delayed_refs_extra, true);
}
return trans;
}
@@ -5291,7 +5222,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv *rsv = NULL;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5308,18 +5239,18 @@ void btrfs_evict_inode(struct inode *inode)
((btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
- goto no_delete;
+ goto out;
if (is_bad_inode(inode))
- goto no_delete;
+ goto out;
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- goto no_delete;
+ goto out;
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
- goto no_delete;
+ goto out;
}
/*
@@ -5328,7 +5259,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
- goto no_delete;
+ goto out;
/*
* This drops any pending insert or delete operations we have for this
@@ -5340,7 +5271,7 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
- goto no_delete;
+ goto out;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
rsv->failfast = true;
@@ -5356,16 +5287,21 @@ void btrfs_evict_inode(struct inode *inode)
trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
- goto free_rsv;
+ goto out;
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(fs_info);
+ /*
+ * We have not added new delayed items for our inode after we
+ * have flushed its delayed items, so no need to throttle on
+ * delayed items. However we have modified extent buffers.
+ */
+ btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto free_rsv;
+ goto out;
else if (!ret)
break;
}
@@ -5387,9 +5323,8 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans);
}
-free_rsv:
+out:
btrfs_free_block_rsv(fs_info, rsv);
-no_delete:
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -6981,6 +6916,7 @@ out:
}
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
const u64 start,
const u64 len,
const u64 orig_start,
@@ -6991,7 +6927,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const int type)
{
struct extent_map *em = NULL;
- int ret;
+ struct btrfs_ordered_extent *ordered;
if (type != BTRFS_ORDERED_NOCOW) {
em = create_io_em(inode, start, len, orig_start, block_start,
@@ -7001,18 +6937,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
- block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (ret) {
+ ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
+ block_start, block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
+ if (IS_ERR(ordered)) {
if (em) {
free_extent_map(em);
btrfs_drop_extent_map_range(inode, start,
start + len - 1, false);
}
- em = ERR_PTR(ret);
+ em = ERR_CAST(ordered);
+ } else {
+ ASSERT(!dio_data->ordered);
+ dio_data->ordered = ordered;
}
out:
@@ -7020,6 +6959,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
u64 start, u64 len)
{
struct btrfs_root *root = inode->root;
@@ -7035,7 +6975,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
if (ret)
return ERR_PTR(ret);
- em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+ em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
ins.objectid, ins.offset, ins.offset,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -7380,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
space_reserved = true;
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
@@ -7422,7 +7362,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
goto out;
space_reserved = true;
- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -7728,6 +7668,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
pos + length - 1, NULL);
ret = -ENOTBLK;
}
+ if (write) {
+ btrfs_put_ordered_extent(dio_data->ordered);
+ dio_data->ordered = NULL;
+ }
if (write)
extent_changeset_free(dio_data->data_reserved);
@@ -7767,14 +7711,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
container_of(bbio, struct btrfs_dio_private, bbio);
struct btrfs_dio_data *dio_data = iter->private;
- btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private);
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_dio_end_io, bio->bi_private);
+ bbio->inode = BTRFS_I(iter->inode);
bbio->file_offset = file_offset;
dip->file_offset = file_offset;
dip->bytes = bio->bi_iter.bi_size;
dio_data->submitted += bio->bi_iter.bi_size;
- btrfs_submit_bio(bio, 0);
+
+ /*
+ * Check if we are doing a partial write. If we are, we need to split
+ * the ordered extent to match the submitted bio. Hang on to the
+ * remaining unfinishable ordered_extent in dio_data so that it can be
+ * cancelled in iomap_end to avoid a deadlock wherein faulting the
+ * remaining pages is blocked on the outstanding ordered extent.
+ */
+ if (iter->flags & IOMAP_WRITE) {
+ int ret;
+
+ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+ if (ret) {
+ btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
+ return;
+ }
+ }
+
+ btrfs_submit_bio(bbio, 0);
}
static const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -7789,7 +7753,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
- struct btrfs_dio_data data;
+ struct btrfs_dio_data data = { 0 };
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, &data, done_before);
@@ -7798,7 +7762,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before)
{
- struct btrfs_dio_data data;
+ struct btrfs_dio_data data = { 0 };
return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, &data, done_before);
@@ -9908,8 +9872,6 @@ out:
}
struct btrfs_encoded_read_private {
- struct btrfs_inode *inode;
- u64 file_offset;
wait_queue_head_t wait;
atomic_t pending;
blk_status_t status;
@@ -9939,45 +9901,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
u64 disk_io_size, struct page **pages)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
- .inode = inode,
- .file_offset = file_offset,
.pending = ATOMIC_INIT(1),
};
unsigned long i = 0;
- u64 cur = 0;
+ struct btrfs_bio *bbio;
init_waitqueue_head(&priv.wait);
- /* Submit bios for the extent, splitting due to bio limits as necessary. */
- while (cur < disk_io_size) {
- struct bio *bio = NULL;
- u64 remaining = disk_io_size - cur;
-
- while (bio || remaining) {
- size_t bytes = min_t(u64, remaining, PAGE_SIZE);
-
- if (!bio) {
- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
- inode,
- btrfs_encoded_read_endio,
- &priv);
- bio->bi_iter.bi_sector =
- (disk_bytenr + cur) >> SECTOR_SHIFT;
- }
- if (!bytes ||
- bio_add_page(bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
- btrfs_submit_bio(bio, 0);
- bio = NULL;
- continue;
- }
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ btrfs_encoded_read_endio, &priv);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;
- i++;
- cur += bytes;
- remaining -= bytes;
+ do {
+ size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
+
+ if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
+ atomic_inc(&priv.pending);
+ btrfs_submit_bio(bbio, 0);
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
+ btrfs_encoded_read_endio, &priv);
+ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->inode = inode;
+ continue;
}
- }
+
+ i++;
+ disk_bytenr += bytes;
+ disk_io_size -= bytes;
+ } while (disk_io_size);
+
+ atomic_inc(&priv.pending);
+ btrfs_submit_bio(bbio, 0);
if (atomic_dec_return(&priv.pending))
io_wait_event(priv.wait, !atomic_read(&priv.pending));
@@ -10398,13 +10356,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
btrfs_delalloc_release_extents(inode, num_bytes);
- if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
- ins.offset, pages, nr_pages, 0, NULL,
- false)) {
- btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
- ret = -EIO;
- goto out_pages;
- }
+ btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, false);
ret = orig_count;
goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba769a1eb87a..25833b4eeaf5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 870528d87526..3a496b0d3d2b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
* acquire the lock.
*/
-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
- int ret;
-
- ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
- if (ret)
- return ret;
-
atomic_set(&lock->readers, 0);
+ atomic_set(&lock->writers, 0);
init_waitqueue_head(&lock->pending_readers);
init_waitqueue_head(&lock->pending_writers);
-
- return 0;
-}
-
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
-{
- percpu_counter_destroy(&lock->writers);
}
/* Return true if acquisition is successful, false otherwise */
@@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
if (atomic_read(&lock->readers))
return false;
- percpu_counter_inc(&lock->writers);
+ atomic_inc(&lock->writers);
/* Ensure writers count is updated before we check for pending readers */
- smp_mb();
+ smp_mb__after_atomic();
if (atomic_read(&lock->readers)) {
btrfs_drew_write_unlock(lock);
return false;
@@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
- percpu_counter_dec(&lock->writers);
+ atomic_dec(&lock->writers);
cond_wake_up(&lock->pending_readers);
}
@@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
*/
smp_mb__after_atomic();
- wait_event(lock->pending_readers,
- percpu_counter_sum(&lock->writers) == 0);
+ wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 11c2269b4b6f..edb9b4a0dba1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
struct btrfs_drew_lock {
atomic_t readers;
- struct percpu_counter writers;
+ atomic_t writers;
wait_queue_head_t pending_writers;
wait_queue_head_t pending_readers;
};
-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index de3e18bce24a..00328c856be6 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca
return cache->size;
}
-static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache)
-{
- return cache->size >= cache->max_size;
-}
-
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
struct btrfs_lru_cache *cache)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 71f6d8302d50..3a095b9c6373 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -17,6 +17,7 @@
#include "compression.h"
#include "ctree.h"
#include "super.h"
+#include "btrfs_inode.h"
#define LZO_LEN 4
@@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb,
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
char *kaddr;
int ret;
@@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
*/
btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
seg_len);
- ret = -EIO;
- goto out;
+ return -EIO;
}
/* Copy the compressed segment payload into workspace */
@@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->buf, &out_len);
if (ret != LZO_E_OK) {
btrfs_err(fs_info, "failed to decompress");
- ret = -EIO;
- goto out;
+ return -EIO;
}
/* Copy the data into inode pages */
@@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* All data read, exit */
if (ret == 0)
- goto out;
+ return 0;
ret = 0;
/* Check if the sector has enough space for a segment header */
@@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Skip the padding zeros */
cur_in += sector_bytes_left;
}
-out:
- if (!ret)
- zero_fill_bio(cb->orig_bio);
- return ret;
+
+ return 0;
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index fde5aaa6e7c9..310a05cf95ef 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt,
#endif
#ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line)
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line)
{
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 8c516ee58ff9..ac2d1982ba3d 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -160,7 +160,7 @@ do { \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
-void __cold btrfs_assertfail(const char *expr, const char *file, int line);
+void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line);
#define ASSERT(expr) \
(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6c24b69e2d0a..a9778a91511e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
- * tree is given a single reference on the ordered extent that was inserted.
+ * tree is given a single reference on the ordered extent that was inserted, and
+ * the returned pointer is given a second reference.
*
- * Return: 0 or -ENOMEM.
+ * Return: the new ordered extent or error pointer.
*/
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned flags,
- int compress_type)
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
- return ret;
+ return ERR_PTR(ret);
ret = 0;
} else {
/*
@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
*/
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
if (ret < 0)
- return ret;
+ return ERR_PTR(ret);
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
+ /* One ref for the returned entry to match semantics of lookup. */
+ refcount_inc(&entry->refs);
+
+ return entry;
+}
+
+/*
+ * Add a new btrfs_ordered_extent for the range, but drop the reference instead
+ * of returning it to the caller.
+ */
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
+ ram_bytes, disk_bytenr,
+ disk_num_bytes, offset, flags,
+ compress_type);
+
+ if (IS_ERR(ordered))
+ return PTR_ERR(ordered);
+ btrfs_put_ordered_extent(ordered);
+
return 0;
}
@@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return false;
}
-
-static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
- u64 len)
-{
- struct inode *inode = ordered->inode;
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- u64 file_offset = ordered->file_offset + pos;
- u64 disk_bytenr = ordered->disk_bytenr + pos;
- unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
-
- /*
- * The splitting extent is already counted and will be added again in
- * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
- */
- percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
- fs_info->delalloc_batch);
- WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
- return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
- disk_bytenr, len, 0, flags,
- ordered->compress_type);
-}
-
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
- u64 post)
+/* Split out a new ordered extent for this first @len bytes of @ordered. */
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
{
struct inode *inode = ordered->inode;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- struct rb_node *node;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret = 0;
+ u64 file_offset = ordered->file_offset;
+ u64 disk_bytenr = ordered->disk_bytenr;
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+ struct rb_node *node;
trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+ ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
+
+ /*
+ * The entire bio must be covered by the ordered extent, but we can't
+ * reduce the original extent to a zero length either.
+ */
+ if (WARN_ON_ONCE(len >= ordered->num_bytes))
+ return -EINVAL;
+ /* We cannot split once ordered extent is past end_bio. */
+ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+ return -EINVAL;
+ /* We cannot split a compressed ordered extent. */
+ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
+ return -EINVAL;
+ /* Checksum list should be empty. */
+ if (WARN_ON_ONCE(!list_empty(&ordered->list)))
+ return -EINVAL;
+
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
@@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (tree->last == node)
tree->last = NULL;
- ordered->file_offset += pre;
- ordered->disk_bytenr += pre;
- ordered->num_bytes -= (pre + post);
- ordered->disk_num_bytes -= (pre + post);
- ordered->bytes_left -= (pre + post);
+ ordered->file_offset += len;
+ ordered->disk_bytenr += len;
+ ordered->num_bytes -= len;
+ ordered->disk_num_bytes -= len;
+ ordered->bytes_left -= len;
/* Re-insert the node */
node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
@@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
spin_unlock_irq(&tree->lock);
- if (pre)
- ret = clone_ordered_extent(ordered, 0, pre);
- if (ret == 0 && post)
- ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
- post);
+ /*
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
+ */
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);
- return ret;
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
}
int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index eb40cb39f842..f0f1138d23c3 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+ struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
+ int compress_type);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned flags,
+ u64 disk_num_bytes, u64 offset, unsigned long flags,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
@@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
struct extent_state **cached_state);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
- u64 post);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 642828c1b299..2fab37f062de 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bioc->raid_map[0];
+ u64 num = rbio->bioc->full_stripe_logical;
/*
* we shift down quite a bit. We're using byte
@@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
struct btrfs_stripe_hash_table *table;
- unsigned long flags;
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
table = rbio->bioc->fs_info->stripe_hash_table;
- spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&table->cache_lock);
__remove_rbio_from_cache(rbio);
- spin_unlock_irqrestore(&table->cache_lock, flags);
+ spin_unlock(&table->cache_lock);
}
/*
@@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
struct btrfs_stripe_hash_table *table;
- unsigned long flags;
struct btrfs_raid_bio *rbio;
table = info->stripe_hash_table;
- spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&table->cache_lock);
while (!list_empty(&table->stripe_cache)) {
rbio = list_entry(table->stripe_cache.next,
struct btrfs_raid_bio,
stripe_cache);
__remove_rbio_from_cache(rbio);
}
- spin_unlock_irqrestore(&table->cache_lock, flags);
+ spin_unlock(&table->cache_lock);
}
/*
@@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
struct btrfs_stripe_hash_table *table;
- unsigned long flags;
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
table = rbio->bioc->fs_info->stripe_hash_table;
- spin_lock_irqsave(&table->cache_lock, flags);
+ spin_lock(&table->cache_lock);
spin_lock(&rbio->bio_list_lock);
/* bump our ref if we were not in the list before */
@@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
__remove_rbio_from_cache(found);
}
- spin_unlock_irqrestore(&table->cache_lock, flags);
+ spin_unlock(&table->cache_lock);
}
/*
@@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len)
*/
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
- unsigned long flags;
unsigned long size = rbio->bio_list_bytes;
int ret = 1;
- spin_lock_irqsave(&rbio->bio_list_lock, flags);
+ spin_lock(&rbio->bio_list_lock);
if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
ret = 0;
BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
- spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+ spin_unlock(&rbio->bio_list_lock);
return ret;
}
@@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
+ if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
return 0;
/* we can't merge with different operations */
@@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
- unsigned long flags;
struct btrfs_raid_bio *freeit = NULL;
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
- spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&h->lock);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
+ if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
continue;
spin_lock(&cur->bio_list_lock);
@@ -724,7 +719,7 @@ lockit:
refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
- spin_unlock_irqrestore(&h->lock, flags);
+ spin_unlock(&h->lock);
if (cache_drop)
remove_rbio_from_cache(cache_drop);
if (freeit)
@@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
int bucket;
struct btrfs_stripe_hash *h;
- unsigned long flags;
int keep_cache = 0;
bucket = rbio_bucket(rbio);
@@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
- spin_lock_irqsave(&h->lock, flags);
+ spin_lock(&h->lock);
spin_lock(&rbio->bio_list_lock);
if (!list_empty(&rbio->hash_list)) {
@@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
list_add(&next->hash_list, &h->hash_list);
refcount_inc(&next->refs);
spin_unlock(&rbio->bio_list_lock);
- spin_unlock_irqrestore(&h->lock, flags);
+ spin_unlock(&h->lock);
if (next->operation == BTRFS_RBIO_READ_REBUILD)
start_async_work(next, recover_rbio_work_locked);
@@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
done:
spin_unlock(&rbio->bio_list_lock);
- spin_unlock_irqrestore(&h->lock, flags);
+ spin_unlock(&h->lock);
done_nolock:
if (!keep_cache)
@@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
ASSERT(index >= 0 && index < rbio->nr_sectors);
- spin_lock_irq(&rbio->bio_list_lock);
+ spin_lock(&rbio->bio_list_lock);
sector = &rbio->bio_sectors[index];
if (sector->page || bio_list_only) {
/* Don't return sector without a valid page pointer */
if (!sector->page)
sector = NULL;
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);
return sector;
}
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);
return &rbio->stripe_sectors[index];
}
@@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
struct btrfs_io_context *bioc)
{
- const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
const unsigned int num_pages = stripe_npages * real_stripes;
const unsigned int stripe_nsectors =
@@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_sector = disk_start >> 9;
bio->bi_private = rbio;
- bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+ __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
struct bio_vec bvec;
struct bvec_iter iter;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
- rbio->bioc->raid_map[0];
+ rbio->bioc->full_stripe_logical;
bio_for_each_segment(bvec, bio, iter) {
u32 bvec_offset;
@@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- spin_lock_irq(&rbio->bio_list_lock);
+ spin_lock(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list)
index_one_bio(rbio, bio);
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);
}
static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
@@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
goto error;
}
- if (likely(!rbio->bioc->num_tgtdevs))
+ if (likely(!rbio->bioc->replace_nr_stripes))
return 0;
- /* Make a copy for the replace target device. */
+ /*
+ * Make a copy for the replace target device.
+ *
+ * Thus the source stripe number (in replace_stripe_src) should be valid.
+ */
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
+
for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
total_sector_nr++) {
struct sector_ptr *sector;
@@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
stripe = total_sector_nr / rbio->stripe_nsectors;
sectornr = total_sector_nr % rbio->stripe_nsectors;
- if (!rbio->bioc->tgtdev_map[stripe]) {
+ /*
+ * For RAID56, there is only one device that can be replaced,
+ * and replace_stripe_src[0] indicates the stripe number we
+ * need to copy from.
+ */
+ if (stripe != rbio->bioc->replace_stripe_src) {
/*
* We can skip the whole stripe completely, note
* total_sector_nr will be increased by one anyway.
@@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
}
ret = rbio_add_io_sector(rbio, bio_list, sector,
- rbio->bioc->tgtdev_map[stripe],
+ rbio->real_stripes,
sectornr, REQ_OP_WRITE);
if (ret)
goto error;
@@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
- rbio->bioc->raid_map[0];
+ rbio->bioc->full_stripe_logical;
int total_nr_sector = offset >> fs_info->sectorsize_bits;
ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
@@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
- const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
const u32 orig_len = orig_bio->bi_iter.bi_size;
const u32 sectorsize = fs_info->sectorsize;
u64 cur_logical;
@@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
* here due to a crc mismatch and we can't give them the
* data they want.
*/
- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bioc->raid_map[faila] ==
- RAID5_P_STRIPE)
+ if (failb == rbio->real_stripes - 1) {
+ if (faila == rbio->real_stripes - 2)
/*
* Only P and Q are corrupted.
* We only care about data stripes recovery,
@@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
goto pstripe;
}
- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+ if (failb == rbio->real_stripes - 2) {
raid6_datap_recov(rbio->real_stripes, sectorsize,
faila, pointers);
} else {
@@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- spin_lock_irq(&rbio->bio_list_lock);
+ spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);
}
index_rbio_pages(rbio);
@@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
- rbio->bioc->raid_map[0]);
- const u64 start = rbio->bioc->raid_map[0];
+ rbio->bioc->full_stripe_logical);
+ const u64 start = rbio->bioc->full_stripe_logical;
const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
fs_info->sectorsize_bits;
int ret;
@@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio)
}
ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
- rbio->csum_buf, rbio->csum_bitmap);
+ rbio->csum_buf, rbio->csum_bitmap, false);
if (ret < 0)
goto error;
if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
@@ -2124,7 +2128,7 @@ error:
*/
btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
- rbio->bioc->raid_map[0], ret);
+ rbio->bioc->full_stripe_logical, ret);
no_csum:
kfree(rbio->csum_buf);
bitmap_free(rbio->csum_bitmap);
@@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio)
* bio list any more, anyone else that wants to change this stripe
* needs to do their own rmw.
*/
- spin_lock_irq(&rbio->bio_list_lock);
+ spin_lock(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
- spin_unlock_irq(&rbio->bio_list_lock);
+ spin_unlock(&rbio->bio_list_lock);
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
@@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
return rbio;
}
-/* Used for both parity scrub and missing. */
-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- unsigned int pgoff, u64 logical)
-{
- const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- int stripe_offset;
- int index;
-
- ASSERT(logical >= rbio->bioc->raid_map[0]);
- ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
- BTRFS_STRIPE_LEN * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
- index = stripe_offset / sectorsize;
- rbio->bio_sectors[index].page = page;
- rbio->bio_sectors[index].pgoff = pgoff;
-}
-
/*
* We just scrub the parity that we have correct data on the same horizontal,
* so we needn't allocate all pages for all the stripes.
@@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check)
else
BUG();
- if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
+ /*
+ * Replace is running and our P/Q stripe is being replaced, then we
+ * need to duplicate the final write to replace target.
+ */
+ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
is_replace = 1;
bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
@@ -2544,13 +2535,18 @@ writeback:
if (!is_replace)
goto submit_write;
+ /*
+ * Replace is running and our parity stripe needs to be duplicated to
+ * the target device. Check we have a valid source stripe number.
+ */
+ ASSERT(rbio->bioc->replace_stripe_src >= 0);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
ret = rbio_add_io_sector(rbio, &bio_list, sector,
- bioc->tgtdev_map[rbio->scrubp],
- sectornr, REQ_OP_WRITE);
+ rbio->real_stripes,
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
start_async_work(rbio, scrub_rbio_work_locked);
}
-
-/* The following code is used for dev replace of a missing RAID 5/6 device. */
-
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
-{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
- struct btrfs_raid_bio *rbio;
-
- rbio = alloc_rbio(fs_info, bioc);
- if (IS_ERR(rbio))
- return NULL;
-
- rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
- bio_list_add(&rbio->bio_list, bio);
- /*
- * This is a special bio which is used to hold the completion handler
- * and make the scrub rbio is similar to the other types
- */
- ASSERT(!bio->bi_iter.bi_size);
-
- set_rbio_range_error(rbio, bio);
-
- return rbio;
-}
-
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
-{
- start_async_work(rbio, recover_rbio_work);
-}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index df0e0abdeb1f..0f7f31c8cb98 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map)
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
+static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
+{
+ return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
@@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- unsigned int pgoff, u64 logical);
-
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
-struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
-
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ef13a9d4e370..09b1988d1791 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1266,7 +1266,7 @@ again:
level = btrfs_header_level(parent);
ASSERT(level >= lowest_level);
- ret = btrfs_bin_search(parent, &key, &slot);
+ ret = btrfs_bin_search(parent, 0, &key, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
@@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (upper->eb && !upper->locked) {
if (!lowest) {
- ret = btrfs_bin_search(upper->eb, key, &slot);
+ ret = btrfs_bin_search(upper->eb, 0, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
@@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
slot = path->slots[upper->level];
btrfs_release_path(path);
} else {
- ret = btrfs_bin_search(upper->eb, key, &slot);
+ ret = btrfs_bin_search(upper->eb, 0, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 69c93ae333f6..836725a19661 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -38,18 +38,14 @@
* - add a mode to also read unallocated space
*/
-struct scrub_block;
struct scrub_ctx;
/*
- * The following three values only influence the performance.
+ * The following value only influences the performance.
*
- * The last one configures the number of parallel and outstanding I/O
- * operations. The first one configures an upper limit for the number
- * of (dynamically allocated) pages that are added to a bio.
+ * This determines the batch size for stripe submitted in one go.
*/
-#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
+#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */
/*
* The following value times PAGE_SIZE needs to be large enough to match the
@@ -57,128 +53,124 @@ struct scrub_ctx;
*/
#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
-#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+/* Represent one sector and its needed info to verify the content. */
+struct scrub_sector_verification {
+ bool is_metadata;
-/*
- * Maximum number of mirrors that can be available for all profiles counting
- * the target device of dev-replace as one. During an active device replace
- * procedure, the target device of the copy operation is a mirror for the
- * filesystem data as well that can be used to read data in order to repair
- * read errors on other disks.
- *
- * Current value is derived from RAID1C4 with 4 copies.
- */
-#define BTRFS_MAX_MIRRORS (4 + 1)
+ union {
+ /*
+ * Csum pointer for data csum verification. Should point to a
+ * sector csum inside scrub_stripe::csums.
+ *
+ * NULL if this data sector has no csum.
+ */
+ u8 *csum;
-struct scrub_recover {
- refcount_t refs;
- struct btrfs_io_context *bioc;
- u64 map_length;
+ /*
+ * Extra info for metadata verification. All sectors inside a
+ * tree block share the same generation.
+ */
+ u64 generation;
+ };
};
-struct scrub_sector {
- struct scrub_block *sblock;
- struct list_head list;
- u64 flags; /* extent flags */
- u64 generation;
- /* Offset in bytes to @sblock. */
- u32 offset;
- atomic_t refs;
- unsigned int have_csum:1;
- unsigned int io_error:1;
- u8 csum[BTRFS_CSUM_SIZE];
-
- struct scrub_recover *recover;
-};
+enum scrub_stripe_flags {
+ /* Set when @mirror_num, @dev, @physical and @logical are set. */
+ SCRUB_STRIPE_FLAG_INITIALIZED,
-struct scrub_bio {
- int index;
- struct scrub_ctx *sctx;
- struct btrfs_device *dev;
- struct bio *bio;
- blk_status_t status;
- u64 logical;
- u64 physical;
- struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
- int sector_count;
- int next_free;
- struct work_struct work;
-};
+ /* Set when the read-repair is finished. */
+ SCRUB_STRIPE_FLAG_REPAIR_DONE,
-struct scrub_block {
/*
- * Each page will have its page::private used to record the logical
- * bytenr.
+ * Set for data stripes if it's triggered from P/Q stripe.
+ * During such scrub, we should not report errors in data stripes, nor
+ * update the accounting.
*/
- struct page *pages[SCRUB_MAX_PAGES];
- struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
- struct btrfs_device *dev;
- /* Logical bytenr of the sblock */
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
- /* Length of sblock in bytes */
- u32 len;
- int sector_count;
- int mirror_num;
-
- atomic_t outstanding_sectors;
- refcount_t refs; /* free mem on transition to zero */
- struct scrub_ctx *sctx;
- struct scrub_parity *sparity;
- struct {
- unsigned int header_error:1;
- unsigned int checksum_error:1;
- unsigned int no_io_error_seen:1;
- unsigned int generation_error:1; /* also sets header_error */
-
- /* The following is for the data used to check parity */
- /* It is for the data with checksum */
- unsigned int data_corrected:1;
- };
- struct work_struct work;
+ SCRUB_STRIPE_FLAG_NO_REPORT,
};
-/* Used for the chunks with parity stripe such RAID5/6 */
-struct scrub_parity {
- struct scrub_ctx *sctx;
+#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
- struct btrfs_device *scrub_dev;
+/*
+ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
+ */
+struct scrub_stripe {
+ struct scrub_ctx *sctx;
+ struct btrfs_block_group *bg;
- u64 logic_start;
+ struct page *pages[SCRUB_STRIPE_PAGES];
+ struct scrub_sector_verification *sectors;
- u64 logic_end;
+ struct btrfs_device *dev;
+ u64 logical;
+ u64 physical;
+
+ u16 mirror_num;
+
+ /* Should be BTRFS_STRIPE_LEN / sectorsize. */
+ u16 nr_sectors;
+
+ /*
+ * How many data/meta extents are in this stripe. Only for scrub status
+ * reporting purposes.
+ */
+ u16 nr_data_extents;
+ u16 nr_meta_extents;
+
+ atomic_t pending_io;
+ wait_queue_head_t io_wait;
+ wait_queue_head_t repair_wait;
- int nsectors;
+ /*
+ * Indicate the states of the stripe. Bits are defined in
+ * scrub_stripe_flags enum.
+ */
+ unsigned long state;
- u32 stripe_len;
+ /* Indicate which sectors are covered by extent items. */
+ unsigned long extent_sector_bitmap;
- refcount_t refs;
+ /*
+ * The errors hit during the initial read of the stripe.
+ *
+ * Would be utilized for error reporting and repair.
+ */
+ unsigned long init_error_bitmap;
- struct list_head sectors_list;
+ /*
+ * The following error bitmaps are all for the current status.
+ * Every time we submit a new read, these bitmaps may be updated.
+ *
+ * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
+ *
+ * IO and csum errors can happen for both metadata and data.
+ */
+ unsigned long error_bitmap;
+ unsigned long io_error_bitmap;
+ unsigned long csum_error_bitmap;
+ unsigned long meta_error_bitmap;
- /* Work of parity check and repair */
- struct work_struct work;
+ /* For writeback (repair or replace) error reporting. */
+ unsigned long write_error_bitmap;
- /* Mark the parity blocks which have data */
- unsigned long dbitmap;
+ /* Writeback can be concurrent, thus we need to protect the bitmap. */
+ spinlock_t write_error_lock;
/*
- * Mark the parity blocks which have data, but errors happen when
- * read data or check data
+ * Checksum for the whole stripe if this stripe is inside a data block
+ * group.
*/
- unsigned long ebitmap;
+ u8 *csums;
+
+ struct work_struct work;
};
struct scrub_ctx {
- struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+ struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX];
+ struct scrub_stripe *raid56_data_stripes;
struct btrfs_fs_info *fs_info;
int first_free;
- int curr;
- atomic_t bios_in_flight;
- atomic_t workers_pending;
- spinlock_t list_lock;
- wait_queue_head_t list_wait;
+ int cur_stripe;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
@@ -191,10 +183,8 @@ struct scrub_ctx {
int is_dev_replace;
u64 write_pointer;
- struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
struct btrfs_device *wr_tgtdev;
- bool flush_all_writes;
/*
* statistics
@@ -221,239 +211,66 @@ struct scrub_warning {
struct btrfs_device *dev;
};
-struct full_stripe_lock {
- struct rb_node node;
- u64 logical;
- u64 refs;
- struct mutex mutex;
-};
-
-#ifndef CONFIG_64BIT
-/* This structure is for architectures whose (void *) is smaller than u64 */
-struct scrub_page_private {
- u64 logical;
-};
-#endif
-
-static int attach_scrub_page_private(struct page *page, u64 logical)
-{
-#ifdef CONFIG_64BIT
- attach_page_private(page, (void *)logical);
- return 0;
-#else
- struct scrub_page_private *spp;
-
- spp = kmalloc(sizeof(*spp), GFP_KERNEL);
- if (!spp)
- return -ENOMEM;
- spp->logical = logical;
- attach_page_private(page, (void *)spp);
- return 0;
-#endif
-}
-
-static void detach_scrub_page_private(struct page *page)
-{
-#ifdef CONFIG_64BIT
- detach_page_private(page);
- return;
-#else
- struct scrub_page_private *spp;
-
- spp = detach_page_private(page);
- kfree(spp);
- return;
-#endif
-}
-
-static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
- struct btrfs_device *dev,
- u64 logical, u64 physical,
- u64 physical_for_dev_replace,
- int mirror_num)
-{
- struct scrub_block *sblock;
-
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
- if (!sblock)
- return NULL;
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->logical = logical;
- sblock->physical = physical;
- sblock->physical_for_dev_replace = physical_for_dev_replace;
- sblock->dev = dev;
- sblock->mirror_num = mirror_num;
- sblock->no_io_error_seen = 1;
- /*
- * Scrub_block::pages will be allocated at alloc_scrub_sector() when
- * the corresponding page is not allocated.
- */
- return sblock;
-}
-
-/*
- * Allocate a new scrub sector and attach it to @sblock.
- *
- * Will also allocate new pages for @sblock if needed.
- */
-static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
- u64 logical)
+static void release_scrub_stripe(struct scrub_stripe *stripe)
{
- const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
- struct scrub_sector *ssector;
-
- /* We must never have scrub_block exceed U32_MAX in size. */
- ASSERT(logical - sblock->logical < U32_MAX);
-
- ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
- if (!ssector)
- return NULL;
-
- /* Allocate a new page if the slot is not allocated */
- if (!sblock->pages[page_index]) {
- int ret;
+ if (!stripe)
+ return;
- sblock->pages[page_index] = alloc_page(GFP_KERNEL);
- if (!sblock->pages[page_index]) {
- kfree(ssector);
- return NULL;
- }
- ret = attach_scrub_page_private(sblock->pages[page_index],
- sblock->logical + (page_index << PAGE_SHIFT));
- if (ret < 0) {
- kfree(ssector);
- __free_page(sblock->pages[page_index]);
- sblock->pages[page_index] = NULL;
- return NULL;
- }
+ for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) {
+ if (stripe->pages[i])
+ __free_page(stripe->pages[i]);
+ stripe->pages[i] = NULL;
}
-
- atomic_set(&ssector->refs, 1);
- ssector->sblock = sblock;
- /* The sector to be added should not be used */
- ASSERT(sblock->sectors[sblock->sector_count] == NULL);
- ssector->offset = logical - sblock->logical;
-
- /* The sector count must be smaller than the limit */
- ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
-
- sblock->sectors[sblock->sector_count] = ssector;
- sblock->sector_count++;
- sblock->len += sblock->sctx->fs_info->sectorsize;
-
- return ssector;
-}
-
-static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
-{
- struct scrub_block *sblock = ssector->sblock;
- pgoff_t index;
- /*
- * When calling this function, ssector must be alreaday attached to the
- * parent sblock.
- */
- ASSERT(sblock);
-
- /* The range should be inside the sblock range */
- ASSERT(ssector->offset < sblock->len);
-
- index = ssector->offset >> PAGE_SHIFT;
- ASSERT(index < SCRUB_MAX_PAGES);
- ASSERT(sblock->pages[index]);
- ASSERT(PagePrivate(sblock->pages[index]));
- return sblock->pages[index];
+ kfree(stripe->sectors);
+ kfree(stripe->csums);
+ stripe->sectors = NULL;
+ stripe->csums = NULL;
+ stripe->sctx = NULL;
+ stripe->state = 0;
}
-static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
+ struct scrub_stripe *stripe)
{
- struct scrub_block *sblock = ssector->sblock;
+ int ret;
- /*
- * When calling this function, ssector must be already attached to the
- * parent sblock.
- */
- ASSERT(sblock);
+ memset(stripe, 0, sizeof(*stripe));
- /* The range should be inside the sblock range */
- ASSERT(ssector->offset < sblock->len);
+ stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ stripe->state = 0;
- return offset_in_page(ssector->offset);
-}
+ init_waitqueue_head(&stripe->io_wait);
+ init_waitqueue_head(&stripe->repair_wait);
+ atomic_set(&stripe->pending_io, 0);
+ spin_lock_init(&stripe->write_error_lock);
-static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
-{
- return page_address(scrub_sector_get_page(ssector)) +
- scrub_sector_get_page_offset(ssector);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ if (ret < 0)
+ goto error;
+
+ stripe->sectors = kcalloc(stripe->nr_sectors,
+ sizeof(struct scrub_sector_verification),
+ GFP_KERNEL);
+ if (!stripe->sectors)
+ goto error;
+
+ stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
+ fs_info->csum_size, GFP_KERNEL);
+ if (!stripe->csums)
+ goto error;
+ return 0;
+error:
+ release_scrub_stripe(stripe);
+ return -ENOMEM;
}
-static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
- unsigned int len)
+static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
- return bio_add_page(bio, scrub_sector_get_page(ssector), len,
- scrub_sector_get_page_offset(ssector));
+ wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}
-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck[]);
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct scrub_block *sblock);
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good);
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int sector_num, int force_write);
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
- int sector_num);
-static int scrub_checksum_data(struct scrub_block *sblock);
-static int scrub_checksum_tree_block(struct scrub_block *sblock);
-static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_sector_get(struct scrub_sector *sector);
-static void scrub_sector_put(struct scrub_sector *sector);
-static void scrub_parity_get(struct scrub_parity *sparity);
-static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct work_struct *work);
-static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_sector *sector);
-static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
-{
- return sector->recover &&
- (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
-}
-
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
-{
- refcount_inc(&sctx->refs);
- atomic_inc(&sctx->bios_in_flight);
-}
-
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
-{
- atomic_dec(&sctx->bios_in_flight);
- wake_up(&sctx->list_wait);
- scrub_put_ctx(sctx);
-}
-
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
while (atomic_read(&fs_info->scrub_pause_req)) {
@@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
scrub_pause_off(fs_info);
}
-/*
- * Insert new full stripe lock into full stripe locks tree
- *
- * Return pointer to existing or newly inserted full_stripe_lock structure if
- * everything works well.
- * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
- *
- * NOTE: caller must hold full_stripe_locks_root->lock before calling this
- * function
- */
-static struct full_stripe_lock *insert_full_stripe_lock(
- struct btrfs_full_stripe_locks_tree *locks_root,
- u64 fstripe_logical)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct full_stripe_lock *entry;
- struct full_stripe_lock *ret;
-
- lockdep_assert_held(&locks_root->lock);
-
- p = &locks_root->root.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct full_stripe_lock, node);
- if (fstripe_logical < entry->logical) {
- p = &(*p)->rb_left;
- } else if (fstripe_logical > entry->logical) {
- p = &(*p)->rb_right;
- } else {
- entry->refs++;
- return entry;
- }
- }
-
- /*
- * Insert new lock.
- */
- ret = kmalloc(sizeof(*ret), GFP_KERNEL);
- if (!ret)
- return ERR_PTR(-ENOMEM);
- ret->logical = fstripe_logical;
- ret->refs = 1;
- mutex_init(&ret->mutex);
-
- rb_link_node(&ret->node, parent, p);
- rb_insert_color(&ret->node, &locks_root->root);
- return ret;
-}
-
-/*
- * Search for a full stripe lock of a block group
- *
- * Return pointer to existing full stripe lock if found
- * Return NULL if not found
- */
-static struct full_stripe_lock *search_full_stripe_lock(
- struct btrfs_full_stripe_locks_tree *locks_root,
- u64 fstripe_logical)
-{
- struct rb_node *node;
- struct full_stripe_lock *entry;
-
- lockdep_assert_held(&locks_root->lock);
-
- node = locks_root->root.rb_node;
- while (node) {
- entry = rb_entry(node, struct full_stripe_lock, node);
- if (fstripe_logical < entry->logical)
- node = node->rb_left;
- else if (fstripe_logical > entry->logical)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * Helper to get full stripe logical from a normal bytenr.
- *
- * Caller must ensure @cache is a RAID56 block group.
- */
-static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
-{
- u64 ret;
-
- /*
- * Due to chunk item size limit, full stripe length should not be
- * larger than U32_MAX. Just a sanity check here.
- */
- WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
-
- /*
- * round_down() can only handle power of 2, while RAID56 full
- * stripe length can be 64KiB * n, so we need to manually round down.
- */
- ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
- cache->full_stripe_len + cache->start;
- return ret;
-}
-
-/*
- * Lock a full stripe to avoid concurrency of recovery and read
- *
- * It's only used for profiles with parities (RAID5/6), for other profiles it
- * does nothing.
- *
- * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
- * So caller must call unlock_full_stripe() at the same context.
- *
- * Return <0 if encounters error.
- */
-static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
- bool *locked_ret)
-{
- struct btrfs_block_group *bg_cache;
- struct btrfs_full_stripe_locks_tree *locks_root;
- struct full_stripe_lock *existing;
- u64 fstripe_start;
- int ret = 0;
-
- *locked_ret = false;
- bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
- if (!bg_cache) {
- ASSERT(0);
- return -ENOENT;
- }
-
- /* Profiles not based on parity don't need full stripe lock */
- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
- goto out;
- locks_root = &bg_cache->full_stripe_locks_root;
-
- fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
- /* Now insert the full stripe lock */
- mutex_lock(&locks_root->lock);
- existing = insert_full_stripe_lock(locks_root, fstripe_start);
- mutex_unlock(&locks_root->lock);
- if (IS_ERR(existing)) {
- ret = PTR_ERR(existing);
- goto out;
- }
- mutex_lock(&existing->mutex);
- *locked_ret = true;
-out:
- btrfs_put_block_group(bg_cache);
- return ret;
-}
-
-/*
- * Unlock a full stripe.
- *
- * NOTE: Caller must ensure it's the same context calling corresponding
- * lock_full_stripe().
- *
- * Return 0 if we unlock full stripe without problem.
- * Return <0 for error
- */
-static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
- bool locked)
-{
- struct btrfs_block_group *bg_cache;
- struct btrfs_full_stripe_locks_tree *locks_root;
- struct full_stripe_lock *fstripe_lock;
- u64 fstripe_start;
- bool freeit = false;
- int ret = 0;
-
- /* If we didn't acquire full stripe lock, no need to continue */
- if (!locked)
- return 0;
-
- bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
- if (!bg_cache) {
- ASSERT(0);
- return -ENOENT;
- }
- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
- goto out;
-
- locks_root = &bg_cache->full_stripe_locks_root;
- fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
-
- mutex_lock(&locks_root->lock);
- fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
- /* Unpaired unlock_full_stripe() detected */
- if (!fstripe_lock) {
- WARN_ON(1);
- ret = -ENOENT;
- mutex_unlock(&locks_root->lock);
- goto out;
- }
-
- if (fstripe_lock->refs == 0) {
- WARN_ON(1);
- btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
- fstripe_lock->logical);
- } else {
- fstripe_lock->refs--;
- }
-
- if (fstripe_lock->refs == 0) {
- rb_erase(&fstripe_lock->node, &locks_root->root);
- freeit = true;
- }
- mutex_unlock(&locks_root->lock);
-
- mutex_unlock(&fstripe_lock->mutex);
- if (freeit)
- kfree(fstripe_lock);
-out:
- btrfs_put_block_group(bg_cache);
- return ret;
-}
-
static void scrub_free_csums(struct scrub_ctx *sctx)
{
while (!list_empty(&sctx->csum_list)) {
@@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (!sctx)
return;
- /* this can happen when scrub is cancelled */
- if (sctx->curr != -1) {
- struct scrub_bio *sbio = sctx->bios[sctx->curr];
-
- for (i = 0; i < sbio->sector_count; i++)
- scrub_block_put(sbio->sectors[i]->sblock);
- bio_put(sbio->bio);
- }
+ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++)
+ release_scrub_stripe(&sctx->stripes[i]);
- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
- struct scrub_bio *sbio = sctx->bios[i];
-
- if (!sbio)
- break;
- kfree(sbio);
- }
-
- kfree(sctx->wr_curr_bio);
scrub_free_csums(sctx);
kfree(sctx);
}
@@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
- sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
- struct scrub_bio *sbio;
+ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) {
+ int ret;
- sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
- if (!sbio)
+ ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
+ if (ret < 0)
goto nomem;
- sctx->bios[i] = sbio;
-
- sbio->index = i;
- sbio->sctx = sctx;
- sbio->sector_count = 0;
- INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
-
- if (i != SCRUB_BIOS_PER_SCTX - 1)
- sctx->bios[i]->next_free = i + 1;
- else
- sctx->bios[i]->next_free = -1;
+ sctx->stripes[i].sctx = sctx;
}
sctx->first_free = 0;
- atomic_set(&sctx->bios_in_flight, 0);
- atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
- spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
- init_waitqueue_head(&sctx->list_wait);
sctx->throttle_deadline = 0;
- WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
- sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
- sctx->flush_all_writes = false;
}
return sctx;
@@ -898,10 +464,10 @@ err:
return 0;
}
-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
+static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
+ bool is_super, u64 logical, u64 physical)
{
- struct btrfs_device *dev;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
struct extent_buffer *eb;
@@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;
- WARN_ON(sblock->sector_count < 1);
- dev = sblock->dev;
- fs_info = sblock->sctx->fs_info;
-
/* Super block error, no need to search extent tree. */
- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (is_super) {
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
- errstr, btrfs_dev_name(dev), sblock->physical);
+ errstr, btrfs_dev_name(dev), physical);
return;
}
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->physical;
- swarn.logical = sblock->logical;
+ swarn.physical = physical;
+ swarn.logical = logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -978,447 +540,6 @@ out:
btrfs_free_path(path);
}
-static inline void scrub_get_recover(struct scrub_recover *recover)
-{
- refcount_inc(&recover->refs);
-}
-
-static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
- struct scrub_recover *recover)
-{
- if (refcount_dec_and_test(&recover->refs)) {
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(recover->bioc);
- kfree(recover);
- }
-}
-
-/*
- * scrub_handle_errored_block gets called when either verification of the
- * sectors failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all sectors in the bio, even though only one
- * may be bad.
- * The goal of this function is to repair the errored block by using the
- * contents of one of the mirrors.
- */
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
-{
- struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev = sblock_to_check->dev;
- struct btrfs_fs_info *fs_info;
- u64 logical;
- unsigned int failed_mirror_index;
- unsigned int is_metadata;
- unsigned int have_csum;
- /* One scrub_block for each mirror */
- struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
- struct scrub_block *sblock_bad;
- int ret;
- int mirror_index;
- int sector_num;
- int success;
- bool full_stripe_locked;
- unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- BUG_ON(sblock_to_check->sector_count < 1);
- fs_info = sctx->fs_info;
- if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
- /*
- * If we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- scrub_print_warning("super block error", sblock_to_check);
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return 0;
- }
- logical = sblock_to_check->logical;
- ASSERT(sblock_to_check->mirror_num);
- failed_mirror_index = sblock_to_check->mirror_num - 1;
- is_metadata = !(sblock_to_check->sectors[0]->flags &
- BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->sectors[0]->have_csum;
-
- if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
- return 0;
-
- /*
- * We must use GFP_NOFS because the scrub task might be waiting for a
- * worker task executing this function and in turn a transaction commit
- * might be waiting the scrub task to pause (which needs to wait for all
- * the worker tasks to complete before pausing).
- * We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
- * this function.
- */
- nofs_flag = memalloc_nofs_save();
- /*
- * For RAID5/6, race can happen for a different device scrub thread.
- * For data corruption, Parity and Data threads will both try
- * to recovery the data.
- * Race can lead to doubly added csum error, or even unrecoverable
- * error.
- */
- ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
- if (ret < 0) {
- memalloc_nofs_restore(nofs_flag);
- spin_lock(&sctx->stat_lock);
- if (ret == -ENOMEM)
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- return ret;
- }
-
- /*
- * read all mirrors one after the other. This includes to
- * re-read the extent or metadata block that failed (that was
- * the cause that this fixup code is called) another time,
- * sector by sector this time in order to know which sectors
- * caused I/O errors and which ones are good (for all mirrors).
- * It is the goal to handle the situation when more than one
- * mirror contains I/O errors, but the errors do not
- * overlap, i.e. the data can be repaired by selecting the
- * sectors from those mirrors without I/O error on the
- * particular sectors. One example (with blocks >= 2 * sectorsize)
- * would be that mirror #1 has an I/O error on the first sector,
- * the second sector is good, and mirror #2 has an I/O error on
- * the second sector, but the first sector is good.
- * Then the first sector of the first mirror can be repaired by
- * taking the first sector of the second mirror, and the
- * second sector of the second mirror can be repaired by
- * copying the contents of the 2nd sector of the 1st mirror.
- * One more note: if the sectors of one mirror contain I/O
- * errors, the checksum cannot be verified. In order to get
- * the best data for repairing, the first attempt is to find
- * a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the sectors are picked from
- * mirrors with I/O errors without considering the checksum.
- * If the latter is the case, at the end, the checksum of the
- * repaired area is verified in order to correctly maintain
- * the statistics.
- */
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
- /*
- * Note: the two members refs and outstanding_sectors are not
- * used in the blocks that are used for the recheck procedure.
- *
- * But alloc_scrub_block() will initialize sblock::ref anyway,
- * so we can use scrub_block_put() to clean them up.
- *
- * And here we don't setup the physical/dev for the sblock yet,
- * they will be correctly initialized in scrub_setup_recheck_block().
- */
- sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
- logical, 0, 0, mirror_index);
- if (!sblocks_for_recheck[mirror_index]) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
- }
- }
-
- /* Setup the context, map the logical blocks and alloc the sectors */
- ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
- if (ret) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
- }
- BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck[failed_mirror_index];
-
- /* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, 1);
-
- if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
- sblock_bad->no_io_error_seen) {
- /*
- * The error disappeared after reading sector by sector, or
- * the area was part of a huge bio and other parts of the
- * bio caused I/O errors, or the block layer merged several
- * read requests into one and the error is caused by a
- * different bio (usually one of the two latter cases is
- * the cause)
- */
- spin_lock(&sctx->stat_lock);
- sctx->stat.unverified_errors++;
- sblock_to_check->data_corrected = 1;
- spin_unlock(&sctx->stat_lock);
-
- if (sctx->is_dev_replace)
- scrub_write_block_to_dev_replace(sblock_bad);
- goto out;
- }
-
- if (!sblock_bad->no_io_error_seen) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("i/o error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- } else if (sblock_bad->checksum_error) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.csum_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("checksum error", sblock_to_check);
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- } else if (sblock_bad->header_error) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.verify_errors++;
- spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&rs))
- scrub_print_warning("checksum/header error",
- sblock_to_check);
- if (sblock_bad->generation_error)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- }
-
- if (sctx->readonly) {
- ASSERT(!sctx->is_dev_replace);
- goto out;
- }
-
- /*
- * now build and submit the bios for the other mirrors, check
- * checksums.
- * First try to pick the mirror which is completely without I/O
- * errors and also does not have a checksum error.
- * If one is found, and if a checksum is present, the full block
- * that is known to contain an error is rewritten. Afterwards
- * the block is known to be corrected.
- * If a mirror is found which is completely correct, and no
- * checksum is present, only those sectors are rewritten that had
- * an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other sectors is better (and it
- * could happen otherwise that a correct sector would be
- * overwritten by a bad one).
- */
- for (mirror_index = 0; ;mirror_index++) {
- struct scrub_block *sblock_other;
-
- if (mirror_index == failed_mirror_index)
- continue;
-
- /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
- if (mirror_index >= BTRFS_MAX_MIRRORS)
- break;
- if (!sblocks_for_recheck[mirror_index]->sector_count)
- break;
-
- sblock_other = sblocks_for_recheck[mirror_index];
- } else {
- struct scrub_recover *r = sblock_bad->sectors[0]->recover;
- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
-
- if (mirror_index >= max_allowed)
- break;
- if (!sblocks_for_recheck[1]->sector_count)
- break;
-
- ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck[1];
- sblock_other->mirror_num = 1 + mirror_index;
- }
-
- /* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, 0);
-
- if (!sblock_other->header_error &&
- !sblock_other->checksum_error &&
- sblock_other->no_io_error_seen) {
- if (sctx->is_dev_replace) {
- scrub_write_block_to_dev_replace(sblock_other);
- goto corrected_error;
- } else {
- ret = scrub_repair_block_from_good_copy(
- sblock_bad, sblock_other);
- if (!ret)
- goto corrected_error;
- }
- }
- }
-
- if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
- goto did_not_correct_error;
-
- /*
- * In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those sectors.
- * Select the good sectors from mirrors to rewrite bad sectors from
- * the area to fix. Afterwards verify the checksum of the block
- * that is supposed to be repaired. This verification step is
- * only done for the purpose of statistic counting and for the
- * final scrub report, whether errors remain.
- * A perfect algorithm could make use of the checksum and try
- * all possible combinations of sectors from the different mirrors
- * until the checksum verification succeeds. For example, when
- * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
- * of mirror #2 is readable but the final checksum test fails,
- * then the 2nd sector of mirror #3 could be tried, whether now
- * the final checksum succeeds. But this would be a rare
- * exception and is therefore not implemented. At least it is
- * avoided that the good copy is overwritten.
- * A more useful improvement would be to pick the sectors
- * without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on sectorsize. Then maybe 512 byte of one
- * mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same sectorsize
- * area are unreadable.
- */
- success = 1;
- for (sector_num = 0; sector_num < sblock_bad->sector_count;
- sector_num++) {
- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
- struct scrub_block *sblock_other = NULL;
-
- /* Skip no-io-error sectors in scrub */
- if (!sector_bad->io_error && !sctx->is_dev_replace)
- continue;
-
- if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
- /*
- * In case of dev replace, if raid56 rebuild process
- * didn't work out correct data, then copy the content
- * in sblock_bad to make sure target device is identical
- * to source device, instead of writing garbage data in
- * sblock_for_recheck array to target device.
- */
- sblock_other = NULL;
- } else if (sector_bad->io_error) {
- /* Try to find no-io-error sector in mirrors */
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index]->sector_count > 0;
- mirror_index++) {
- if (!sblocks_for_recheck[mirror_index]->
- sectors[sector_num]->io_error) {
- sblock_other = sblocks_for_recheck[mirror_index];
- break;
- }
- }
- if (!sblock_other)
- success = 0;
- }
-
- if (sctx->is_dev_replace) {
- /*
- * Did not find a mirror to fetch the sector from.
- * scrub_write_sector_to_dev_replace() handles this
- * case (sector->io_error), by filling the block with
- * zeros before submitting the write request
- */
- if (!sblock_other)
- sblock_other = sblock_bad;
-
- if (scrub_write_sector_to_dev_replace(sblock_other,
- sector_num) != 0) {
- atomic64_inc(
- &fs_info->dev_replace.num_write_errors);
- success = 0;
- }
- } else if (sblock_other) {
- ret = scrub_repair_sector_from_good_copy(sblock_bad,
- sblock_other,
- sector_num, 0);
- if (0 == ret)
- sector_bad->io_error = 0;
- else
- success = 0;
- }
- }
-
- if (success && !sctx->is_dev_replace) {
- if (is_metadata || have_csum) {
- /*
- * need to verify the checksum now that all
- * sectors on disk are repaired (the write
- * request for data to be repaired is on its way).
- * Just be lazy and use scrub_recheck_block()
- * which re-reads the data before the checksum
- * is verified, but most likely the data comes out
- * of the page cache.
- */
- scrub_recheck_block(fs_info, sblock_bad, 1);
- if (!sblock_bad->header_error &&
- !sblock_bad->checksum_error &&
- sblock_bad->no_io_error_seen)
- goto corrected_error;
- else
- goto did_not_correct_error;
- } else {
-corrected_error:
- spin_lock(&sctx->stat_lock);
- sctx->stat.corrected_errors++;
- sblock_to_check->data_corrected = 1;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s",
- logical, btrfs_dev_name(dev));
- }
- } else {
-did_not_correct_error:
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s",
- logical, btrfs_dev_name(dev));
- }
-
-out:
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
- struct scrub_recover *recover;
- int sector_index;
-
- /* Not allocated, continue checking the next mirror */
- if (!sblock)
- continue;
-
- for (sector_index = 0; sector_index < sblock->sector_count;
- sector_index++) {
- /*
- * Here we just cleanup the recover, each sector will be
- * properly cleaned up by later scrub_block_put()
- */
- recover = sblock->sectors[sector_index]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->sectors[sector_index]->recover = NULL;
- }
- }
- scrub_block_put(sblock);
- }
-
- ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
- memalloc_nofs_restore(nofs_flag);
- if (ret < 0)
- return ret;
- return 0;
-}
-
static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
@@ -1430,7 +551,7 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
- u64 *raid_map,
+ u64 full_stripe_logical,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
@@ -1438,19 +559,22 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
int i;
if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ?
+ nstripes - 1 : nstripes - 2;
+
/* RAID5/6 */
- for (i = 0; i < nstripes; i++) {
- if (raid_map[i] == RAID6_Q_STRIPE ||
- raid_map[i] == RAID5_P_STRIPE)
- continue;
+ for (i = 0; i < nr_data_stripes; i++) {
+ const u64 data_stripe_start = full_stripe_logical +
+ (i * BTRFS_STRIPE_LEN);
- if (logical >= raid_map[i] &&
- logical < raid_map[i] + BTRFS_STRIPE_LEN)
+ if (logical >= data_stripe_start &&
+ logical < data_stripe_start + BTRFS_STRIPE_LEN)
break;
}
*stripe_index = i;
- *stripe_offset = logical - raid_map[i];
+ *stripe_offset = (logical - full_stripe_logical) &
+ BTRFS_STRIPE_LEN_MASK;
} else {
/* The other RAID type */
*stripe_index = mirror;
@@ -1458,336 +582,6 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
}
-static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck[])
-{
- struct scrub_ctx *sctx = original_sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 logical = original_sblock->logical;
- u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
- u64 generation = original_sblock->sectors[0]->generation;
- u64 flags = original_sblock->sectors[0]->flags;
- u64 have_csum = original_sblock->sectors[0]->have_csum;
- struct scrub_recover *recover;
- struct btrfs_io_context *bioc;
- u64 sublen;
- u64 mapped_length;
- u64 stripe_offset;
- int stripe_index;
- int sector_index = 0;
- int mirror_index;
- int nmirrors;
- int ret;
-
- while (length > 0) {
- sublen = min_t(u64, length, fs_info->sectorsize);
- mapped_length = sublen;
- bioc = NULL;
-
- /*
- * With a length of sectorsize, each returned stripe represents
- * one mirror
- */
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bioc);
- if (ret || !bioc || mapped_length < sublen) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- return -EIO;
- }
-
- recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
- if (!recover) {
- btrfs_put_bioc(bioc);
- btrfs_bio_counter_dec(fs_info);
- return -ENOMEM;
- }
-
- refcount_set(&recover->refs, 1);
- recover->bioc = bioc;
- recover->map_length = mapped_length;
-
- ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
-
- nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
-
- for (mirror_index = 0; mirror_index < nmirrors;
- mirror_index++) {
- struct scrub_block *sblock;
- struct scrub_sector *sector;
-
- sblock = sblocks_for_recheck[mirror_index];
- sblock->sctx = sctx;
-
- sector = alloc_scrub_sector(sblock, logical);
- if (!sector) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- scrub_put_recover(fs_info, recover);
- return -ENOMEM;
- }
- sector->flags = flags;
- sector->generation = generation;
- sector->have_csum = have_csum;
- if (have_csum)
- memcpy(sector->csum,
- original_sblock->sectors[0]->csum,
- sctx->fs_info->csum_size);
-
- scrub_stripe_index_and_offset(logical,
- bioc->map_type,
- bioc->raid_map,
- bioc->num_stripes -
- bioc->num_tgtdevs,
- mirror_index,
- &stripe_index,
- &stripe_offset);
- /*
- * We're at the first sector, also populate @sblock
- * physical and dev.
- */
- if (sector_index == 0) {
- sblock->physical =
- bioc->stripes[stripe_index].physical +
- stripe_offset;
- sblock->dev = bioc->stripes[stripe_index].dev;
- sblock->physical_for_dev_replace =
- original_sblock->physical_for_dev_replace;
- }
-
- BUG_ON(sector_index >= original_sblock->sector_count);
- scrub_get_recover(recover);
- sector->recover = recover;
- }
- scrub_put_recover(fs_info, recover);
- length -= sublen;
- logical += sublen;
- sector_index++;
- }
-
- return 0;
-}
-
-static void scrub_bio_wait_endio(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
- struct bio *bio,
- struct scrub_sector *sector)
-{
- DECLARE_COMPLETION_ONSTACK(done);
-
- bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
- SECTOR_SHIFT;
- bio->bi_private = &done;
- bio->bi_end_io = scrub_bio_wait_endio;
- raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
-
- wait_for_completion_io(&done);
- return blk_status_to_errno(bio->bi_status);
-}
-
-static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock)
-{
- struct scrub_sector *first_sector = sblock->sectors[0];
- struct bio *bio;
- int i;
-
- /* All sectors in sblock belong to the same stripe on the same device. */
- ASSERT(sblock->dev);
- if (!sblock->dev->bdev)
- goto out;
-
- bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
-
- for (i = 0; i < sblock->sector_count; i++) {
- struct scrub_sector *sector = sblock->sectors[i];
-
- bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
- }
-
- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
- bio_put(bio);
- goto out;
- }
-
- bio_put(bio);
-
- scrub_recheck_block_checksum(sblock);
-
- return;
-out:
- for (i = 0; i < sblock->sector_count; i++)
- sblock->sectors[i]->io_error = 1;
-
- sblock->no_io_error_seen = 0;
-}
-
-/*
- * This function will check the on disk data for checksum errors, header errors
- * and read I/O errors. If any I/O errors happen, the exact sectors which are
- * errored are marked as being bad. The goal is to enable scrub to take those
- * sectors that are not errored from all the mirrors so that the sectors that
- * are errored in the just handled mirror can be repaired.
- */
-static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int retry_failed_mirror)
-{
- int i;
-
- sblock->no_io_error_seen = 1;
-
- /* short cut for raid56 */
- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
- return scrub_recheck_block_on_raid56(fs_info, sblock);
-
- for (i = 0; i < sblock->sector_count; i++) {
- struct scrub_sector *sector = sblock->sectors[i];
- struct bio bio;
- struct bio_vec bvec;
-
- if (sblock->dev->bdev == NULL) {
- sector->io_error = 1;
- sblock->no_io_error_seen = 0;
- continue;
- }
-
- bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
- bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
- bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
- SECTOR_SHIFT;
-
- btrfsic_check_bio(&bio);
- if (submit_bio_wait(&bio)) {
- sector->io_error = 1;
- sblock->no_io_error_seen = 0;
- }
-
- bio_uninit(&bio);
- }
-
- if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(sblock);
-}
-
-static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
-{
- struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
- int ret;
-
- ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
- return !ret;
-}
-
-static void scrub_recheck_block_checksum(struct scrub_block *sblock)
-{
- sblock->header_error = 0;
- sblock->checksum_error = 0;
- sblock->generation_error = 0;
-
- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
- scrub_checksum_data(sblock);
- else
- scrub_checksum_tree_block(sblock);
-}
-
-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good)
-{
- int i;
- int ret = 0;
-
- for (i = 0; i < sblock_bad->sector_count; i++) {
- int ret_sub;
-
- ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
- sblock_good, i, 1);
- if (ret_sub)
- ret = ret_sub;
- }
-
- return ret;
-}
-
-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int sector_num, int force_write)
-{
- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
- struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
- struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
-
- if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || sector_bad->io_error) {
- struct bio bio;
- struct bio_vec bvec;
- int ret;
-
- if (!sblock_bad->dev->bdev) {
- btrfs_warn_rl(fs_info,
- "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
- return -EIO;
- }
-
- bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
- bio.bi_iter.bi_sector = (sblock_bad->physical +
- sector_bad->offset) >> SECTOR_SHIFT;
- ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
-
- btrfsic_check_bio(&bio);
- ret = submit_bio_wait(&bio);
- bio_uninit(&bio);
-
- if (ret) {
- btrfs_dev_stat_inc_and_print(sblock_bad->dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- atomic64_inc(&fs_info->dev_replace.num_write_errors);
- return -EIO;
- }
- }
-
- return 0;
-}
-
-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
-{
- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- int i;
-
- /*
- * This block is used for the check of the parity on the source device,
- * so the data needn't be written into the destination device.
- */
- if (sblock->sparity)
- return;
-
- for (i = 0; i < sblock->sector_count; i++) {
- int ret;
-
- ret = scrub_write_sector_to_dev_replace(sblock, i);
- if (ret)
- atomic64_inc(&fs_info->dev_replace.num_write_errors);
- }
-}
-
-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
-{
- const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
- struct scrub_sector *sector = sblock->sectors[sector_num];
-
- if (sector->io_error)
- memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
-
- return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
-}
-
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
int ret = 0;
@@ -1810,1089 +604,653 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static void scrub_block_get(struct scrub_block *sblock)
+static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
- refcount_inc(&sblock->refs);
-}
-
-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_sector *sector)
-{
- struct scrub_block *sblock = sector->sblock;
- struct scrub_bio *sbio;
- int ret;
- const u32 sectorsize = sctx->fs_info->sectorsize;
-
- mutex_lock(&sctx->wr_lock);
-again:
- if (!sctx->wr_curr_bio) {
- sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
- GFP_KERNEL);
- if (!sctx->wr_curr_bio) {
- mutex_unlock(&sctx->wr_lock);
- return -ENOMEM;
- }
- sctx->wr_curr_bio->sctx = sctx;
- sctx->wr_curr_bio->sector_count = 0;
- }
- sbio = sctx->wr_curr_bio;
- if (sbio->sector_count == 0) {
- ret = fill_writer_pointer_gap(sctx, sector->offset +
- sblock->physical_for_dev_replace);
- if (ret) {
- mutex_unlock(&sctx->wr_lock);
- return ret;
- }
-
- sbio->physical = sblock->physical_for_dev_replace + sector->offset;
- sbio->logical = sblock->logical + sector->offset;
- sbio->dev = sctx->wr_tgtdev;
- if (!sbio->bio) {
- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
- REQ_OP_WRITE, GFP_NOFS);
- }
- sbio->bio->bi_private = sbio;
- sbio->bio->bi_end_io = scrub_wr_bio_end_io;
- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
- sbio->status = 0;
- } else if (sbio->physical + sbio->sector_count * sectorsize !=
- sblock->physical_for_dev_replace + sector->offset ||
- sbio->logical + sbio->sector_count * sectorsize !=
- sblock->logical + sector->offset) {
- scrub_wr_submit(sctx);
- goto again;
- }
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
- if (ret != sectorsize) {
- if (sbio->sector_count < 1) {
- bio_put(sbio->bio);
- sbio->bio = NULL;
- mutex_unlock(&sctx->wr_lock);
- return -EIO;
- }
- scrub_wr_submit(sctx);
- goto again;
- }
-
- sbio->sectors[sbio->sector_count] = sector;
- scrub_sector_get(sector);
- /*
- * Since ssector no longer holds a page, but uses sblock::pages, we
- * have to ensure the sblock had not been freed before our write bio
- * finished.
- */
- scrub_block_get(sector->sblock);
-
- sbio->sector_count++;
- if (sbio->sector_count == sctx->sectors_per_bio)
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
-
- return 0;
+ return stripe->pages[page_index];
}
-static void scrub_wr_submit(struct scrub_ctx *sctx)
+static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
+ int sector_nr)
{
- struct scrub_bio *sbio;
-
- if (!sctx->wr_curr_bio)
- return;
-
- sbio = sctx->wr_curr_bio;
- sctx->wr_curr_bio = NULL;
- scrub_pending_bio_inc(sctx);
- /* process all writes in a single worker thread. Then the block layer
- * orders the requests before sending them to the driver which
- * doubled the write performance on spinning disks when measured
- * with Linux 3.5 */
- btrfsic_check_bio(sbio->bio);
- submit_bio(sbio->bio);
-
- if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->sector_count *
- sctx->fs_info->sectorsize;
-}
-
-static void scrub_wr_bio_end_io(struct bio *bio)
-{
- struct scrub_bio *sbio = bio->bi_private;
- struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
-
- sbio->status = bio->bi_status;
- sbio->bio = bio;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
- INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
- queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+ return offset_in_page(sector_nr << fs_info->sectorsize_bits);
}
-static void scrub_wr_bio_end_io_worker(struct work_struct *work)
+static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
- struct scrub_ctx *sctx = sbio->sctx;
- int i;
-
- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
- if (sbio->status) {
- struct btrfs_dev_replace *dev_replace =
- &sbio->sctx->fs_info->dev_replace;
-
- for (i = 0; i < sbio->sector_count; i++) {
- struct scrub_sector *sector = sbio->sectors[i];
-
- sector->io_error = 1;
- atomic64_inc(&dev_replace->num_write_errors);
- }
- }
-
- /*
- * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
- * endio we should put the sblock.
- */
- for (i = 0; i < sbio->sector_count; i++) {
- scrub_block_put(sbio->sectors[i]->sblock);
- scrub_sector_put(sbio->sectors[i]);
- }
-
- bio_put(sbio->bio);
- kfree(sbio);
- scrub_pending_bio_dec(sctx);
-}
-
-static int scrub_checksum(struct scrub_block *sblock)
-{
- u64 flags;
- int ret;
-
- /*
- * No need to initialize these stats currently,
- * because this function only use return value
- * instead of these stats value.
- *
- * Todo:
- * always use stats
- */
- sblock->header_error = 0;
- sblock->generation_error = 0;
- sblock->checksum_error = 0;
-
- WARN_ON(sblock->sector_count < 1);
- flags = sblock->sectors[0]->flags;
- ret = 0;
- if (flags & BTRFS_EXTENT_FLAG_DATA)
- ret = scrub_checksum_data(sblock);
- else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
- ret = scrub_checksum_tree_block(sblock);
- else if (flags & BTRFS_EXTENT_FLAG_SUPER)
- ret = scrub_checksum_super(sblock);
- else
- WARN_ON(1);
- if (ret)
- scrub_handle_errored_block(sblock);
-
- return ret;
-}
-
-static int scrub_checksum_data(struct scrub_block *sblock)
-{
- struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+ const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
+ const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
+ const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- u8 csum[BTRFS_CSUM_SIZE];
- struct scrub_sector *sector;
- char *kaddr;
-
- BUG_ON(sblock->sector_count < 1);
- sector = sblock->sectors[0];
- if (!sector->have_csum)
- return 0;
-
- kaddr = scrub_sector_get_kaddr(sector);
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
-
- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
-
- if (memcmp(csum, sector->csum, fs_info->csum_size))
- sblock->checksum_error = 1;
- return sblock->checksum_error;
-}
-
-static int scrub_checksum_tree_block(struct scrub_block *sblock)
-{
- struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_header *h;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- /*
- * This is done in sectorsize steps even for metadata as there's a
- * constraint for nodesize to be aligned to sectorsize. This will need
- * to change so we don't misuse data and metadata units like that.
- */
- const u32 sectorsize = sctx->fs_info->sectorsize;
- const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
- int i;
- struct scrub_sector *sector;
- char *kaddr;
-
- BUG_ON(sblock->sector_count < 1);
-
- /* Each member in sectors is just one sector */
- ASSERT(sblock->sector_count == num_sectors);
-
- sector = sblock->sectors[0];
- kaddr = scrub_sector_get_kaddr(sector);
- h = (struct btrfs_header *)kaddr;
- memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
+ u8 calculated_csum[BTRFS_CSUM_SIZE];
+ struct btrfs_header *header;
/*
- * we don't use the getter functions here, as we
- * a) don't have an extent buffer and
- * b) the page is already kmapped
+ * Here we don't have a good way to attach the pages (and subpages)
+ * to a dummy extent buffer, thus we have to directly grab the members
+ * from pages.
*/
- if (sblock->logical != btrfs_stack_header_bytenr(h)) {
- sblock->header_error = 1;
+ header = (struct btrfs_header *)(page_address(first_page) + first_off);
+ memcpy(on_disk_csum, header->csum, fs_info->csum_size);
+
+ if (logical != btrfs_stack_header_bytenr(header)) {
+ bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad bytenr, has %llu want %llu",
- sblock->logical, sblock->mirror_num,
- btrfs_stack_header_bytenr(h),
- sblock->logical);
- goto out;
+ logical, stripe->mirror_num,
+ btrfs_stack_header_bytenr(header), logical);
+ return;
}
-
- if (!scrub_check_fsid(h->fsid, sector)) {
- sblock->header_error = 1;
+ if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad fsid, has %pU want %pU",
- sblock->logical, sblock->mirror_num,
- h->fsid, sblock->dev->fs_devices->fsid);
- goto out;
+ logical, stripe->mirror_num,
+ header->fsid, fs_info->fs_devices->fsid);
+ return;
}
-
- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
+ if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
- sblock->logical, sblock->mirror_num,
- h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
- goto out;
+ logical, stripe->mirror_num,
+ header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
+ return;
}
+ /* Now check tree block csum. */
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- sectorsize - BTRFS_CSUM_SIZE);
+ crypto_shash_update(shash, page_address(first_page) + first_off +
+ BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+
+ for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
- for (i = 1; i < num_sectors; i++) {
- kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
- crypto_shash_update(shash, kaddr, sectorsize);
+ crypto_shash_update(shash, page_address(page) + page_off,
+ fs_info->sectorsize);
}
crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
- sblock->checksum_error = 1;
+ if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
- sblock->logical, sblock->mirror_num,
+ logical, stripe->mirror_num,
CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
- goto out;
+ return;
}
-
- if (sector->generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
+ if (stripe->sectors[sector_nr].generation !=
+ btrfs_stack_header_generation(header)) {
+ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
"tree block %llu mirror %u has bad generation, has %llu want %llu",
- sblock->logical, sblock->mirror_num,
- btrfs_stack_header_generation(h),
- sector->generation);
+ logical, stripe->mirror_num,
+ btrfs_stack_header_generation(header),
+ stripe->sectors[sector_nr].generation);
+ return;
}
-
-out:
- return sblock->header_error || sblock->checksum_error;
+ bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
+ bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}
-static int scrub_checksum_super(struct scrub_block *sblock)
+static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
- struct btrfs_super_block *s;
- struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct scrub_sector *sector;
- char *kaddr;
- int fail_gen = 0;
- int fail_cor = 0;
-
- BUG_ON(sblock->sector_count < 1);
- sector = sblock->sectors[0];
- kaddr = scrub_sector_get_kaddr(sector);
- s = (struct btrfs_super_block *)kaddr;
-
- if (sblock->logical != btrfs_super_bytenr(s))
- ++fail_cor;
-
- if (sector->generation != btrfs_super_generation(s))
- ++fail_gen;
-
- if (!scrub_check_fsid(s->fsid, sector))
- ++fail_cor;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
+ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+ struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+ u8 csum_buf[BTRFS_CSUM_SIZE];
+ int ret;
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
- crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
+ ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
- if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
- ++fail_cor;
+ /* Sector not utilized, skip it. */
+ if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
+ return;
- return fail_cor + fail_gen;
-}
+ /* IO error, no need to check. */
+ if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ return;
-static void scrub_block_put(struct scrub_block *sblock)
-{
- if (refcount_dec_and_test(&sblock->refs)) {
- int i;
-
- if (sblock->sparity)
- scrub_parity_put(sblock->sparity);
-
- for (i = 0; i < sblock->sector_count; i++)
- scrub_sector_put(sblock->sectors[i]);
- for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
- if (sblock->pages[i]) {
- detach_scrub_page_private(sblock->pages[i]);
- __free_page(sblock->pages[i]);
- }
+ /* Metadata, verify the full tree block. */
+ if (sector->is_metadata) {
+ /*
+ * Check if the tree block crosses the stripe boudary. If
+ * crossed the boundary, we cannot verify it but only give a
+ * warning.
+ *
+ * This can only happen on a very old filesystem where chunks
+ * are not ensured to be stripe aligned.
+ */
+ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
+ btrfs_warn_rl(fs_info,
+ "tree block at %llu crosses stripe boundary %llu",
+ stripe->logical +
+ (sector_nr << fs_info->sectorsize_bits),
+ stripe->logical);
+ return;
}
- kfree(sblock);
- }
-}
-
-static void scrub_sector_get(struct scrub_sector *sector)
-{
- atomic_inc(&sector->refs);
-}
-
-static void scrub_sector_put(struct scrub_sector *sector)
-{
- if (atomic_dec_and_test(&sector->refs))
- kfree(sector);
-}
-
-/*
- * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
- * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
- */
-static void scrub_throttle(struct scrub_ctx *sctx)
-{
- const int time_slice = 1000;
- struct scrub_bio *sbio;
- struct btrfs_device *device;
- s64 delta;
- ktime_t now;
- u32 div;
- u64 bwlimit;
-
- sbio = sctx->bios[sctx->curr];
- device = sbio->dev;
- bwlimit = READ_ONCE(device->scrub_speed_max);
- if (bwlimit == 0)
+ scrub_verify_one_metadata(stripe, sector_nr);
return;
+ }
/*
- * Slice is divided into intervals when the IO is submitted, adjust by
- * bwlimit and maximum of 64 intervals.
+ * Data is easier, we just verify the data csum (if we have it). For
+ * cases without csum, we have no other choice but to trust it.
*/
- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
- div = min_t(u32, 64, div);
-
- /* Start new epoch, set deadline */
- now = ktime_get();
- if (sctx->throttle_deadline == 0) {
- sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
- sctx->throttle_sent = 0;
+ if (!sector->csum) {
+ clear_bit(sector_nr, &stripe->error_bitmap);
+ return;
}
- /* Still in the time to send? */
- if (ktime_before(now, sctx->throttle_deadline)) {
- /* If current bio is within the limit, send it */
- sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
- if (sctx->throttle_sent <= div_u64(bwlimit, div))
- return;
-
- /* We're over the limit, sleep until the rest of the slice */
- delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+ if (ret < 0) {
+ set_bit(sector_nr, &stripe->csum_error_bitmap);
+ set_bit(sector_nr, &stripe->error_bitmap);
} else {
- /* New request after deadline, start new epoch */
- delta = 0;
- }
-
- if (delta) {
- long timeout;
-
- timeout = div_u64(delta * HZ, 1000);
- schedule_timeout_interruptible(timeout);
+ clear_bit(sector_nr, &stripe->csum_error_bitmap);
+ clear_bit(sector_nr, &stripe->error_bitmap);
}
-
- /* Next call will start the deadline period */
- sctx->throttle_deadline = 0;
-}
-
-static void scrub_submit(struct scrub_ctx *sctx)
-{
- struct scrub_bio *sbio;
-
- if (sctx->curr == -1)
- return;
-
- scrub_throttle(sctx);
-
- sbio = sctx->bios[sctx->curr];
- sctx->curr = -1;
- scrub_pending_bio_inc(sctx);
- btrfsic_check_bio(sbio->bio);
- submit_bio(sbio->bio);
}
-static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_sector *sector)
+/* Verify specified sectors of a stripe. */
+static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
- struct scrub_block *sblock = sector->sblock;
- struct scrub_bio *sbio;
- const u32 sectorsize = sctx->fs_info->sectorsize;
- int ret;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int sector_nr;
-again:
- /*
- * grab a fresh bio or wait for one to become available
- */
- while (sctx->curr == -1) {
- spin_lock(&sctx->list_lock);
- sctx->curr = sctx->first_free;
- if (sctx->curr != -1) {
- sctx->first_free = sctx->bios[sctx->curr]->next_free;
- sctx->bios[sctx->curr]->next_free = -1;
- sctx->bios[sctx->curr]->sector_count = 0;
- spin_unlock(&sctx->list_lock);
- } else {
- spin_unlock(&sctx->list_lock);
- wait_event(sctx->list_wait, sctx->first_free != -1);
- }
+ for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
+ scrub_verify_one_sector(stripe, sector_nr);
+ if (stripe->sectors[sector_nr].is_metadata)
+ sector_nr += sectors_per_tree - 1;
}
- sbio = sctx->bios[sctx->curr];
- if (sbio->sector_count == 0) {
- sbio->physical = sblock->physical + sector->offset;
- sbio->logical = sblock->logical + sector->offset;
- sbio->dev = sblock->dev;
- if (!sbio->bio) {
- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
- REQ_OP_READ, GFP_NOFS);
- }
- sbio->bio->bi_private = sbio;
- sbio->bio->bi_end_io = scrub_bio_end_io;
- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
- sbio->status = 0;
- } else if (sbio->physical + sbio->sector_count * sectorsize !=
- sblock->physical + sector->offset ||
- sbio->logical + sbio->sector_count * sectorsize !=
- sblock->logical + sector->offset ||
- sbio->dev != sblock->dev) {
- scrub_submit(sctx);
- goto again;
- }
-
- sbio->sectors[sbio->sector_count] = sector;
- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
- if (ret != sectorsize) {
- if (sbio->sector_count < 1) {
- bio_put(sbio->bio);
- sbio->bio = NULL;
- return -EIO;
- }
- scrub_submit(sctx);
- goto again;
- }
-
- scrub_block_get(sblock); /* one for the page added to the bio */
- atomic_inc(&sblock->outstanding_sectors);
- sbio->sector_count++;
- if (sbio->sector_count == sctx->sectors_per_bio)
- scrub_submit(sctx);
-
- return 0;
}
-static void scrub_missing_raid56_end_io(struct bio *bio)
+static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
- struct scrub_block *sblock = bio->bi_private;
- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
-
- btrfs_bio_counter_dec(fs_info);
- if (bio->bi_status)
- sblock->no_io_error_seen = 0;
-
- bio_put(bio);
+ int i;
- queue_work(fs_info->scrub_workers, &sblock->work);
+ for (i = 0; i < stripe->nr_sectors; i++) {
+ if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
+ scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+ break;
+ }
+ ASSERT(i < stripe->nr_sectors);
+ return i;
}
-static void scrub_missing_raid56_worker(struct work_struct *work)
+/*
+ * Repair read is different to the regular read:
+ *
+ * - Only reads the failed sectors
+ * - May have extra blocksize limits
+ */
+static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
- struct scrub_block *sblock = container_of(work, struct scrub_block, work);
- struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 logical;
- struct btrfs_device *dev;
+ struct scrub_stripe *stripe = bbio->private;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ u32 bio_size = 0;
+ int i;
- logical = sblock->logical;
- dev = sblock->dev;
+ ASSERT(sector_nr < stripe->nr_sectors);
- if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(sblock);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
- if (!sblock->no_io_error_seen) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "IO error rebuilding logical %llu for dev %s",
- logical, btrfs_dev_name(dev));
- } else if (sblock->header_error || sblock->checksum_error) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
- "failed to rebuild valid logical %llu for dev %s",
- logical, btrfs_dev_name(dev));
+ if (bbio->bio.bi_status) {
+ bitmap_set(&stripe->io_error_bitmap, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ bitmap_set(&stripe->error_bitmap, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
} else {
- scrub_write_block_to_dev_replace(sblock);
- }
-
- if (sctx->is_dev_replace && sctx->flush_all_writes) {
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
}
+ bio_put(&bbio->bio);
+ if (atomic_dec_and_test(&stripe->pending_io))
+ wake_up(&stripe->io_wait);
+}
- scrub_block_put(sblock);
- scrub_pending_bio_dec(sctx);
+static int calc_next_mirror(int mirror, int num_copies)
+{
+ ASSERT(mirror <= num_copies);
+ return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
-static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
+ int mirror, int blocksize, bool wait)
{
- struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = sblock->sector_count << fs_info->sectorsize_bits;
- u64 logical = sblock->logical;
- struct btrfs_io_context *bioc = NULL;
- struct bio *bio;
- struct btrfs_raid_bio *rbio;
- int ret;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ const unsigned long old_error_bitmap = stripe->error_bitmap;
int i;
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bioc);
- if (ret || !bioc || !bioc->raid_map)
- goto bioc_out;
-
- if (WARN_ON(!sctx->is_dev_replace ||
- !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
- /*
- * We shouldn't be scrubbing a missing device. Even for dev
- * replace, we should only get here for RAID 5/6. We either
- * managed to mount something with no mirrors remaining or
- * there's a bug in scrub_find_good_copy()/btrfs_map_block().
- */
- goto bioc_out;
- }
+ ASSERT(stripe->mirror_num >= 1);
+ ASSERT(atomic_read(&stripe->pending_io) == 0);
- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = logical >> 9;
- bio->bi_private = sblock;
- bio->bi_end_io = scrub_missing_raid56_end_io;
+ for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
+ struct page *page;
+ int pgoff;
+ int ret;
- rbio = raid56_alloc_missing_rbio(bio, bioc);
- if (!rbio)
- goto rbio_out;
+ page = scrub_stripe_get_page(stripe, i);
+ pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+ /* The current sector cannot be merged, submit the bio. */
+ if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+ bbio->bio.bi_iter.bi_size >= blocksize)) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ if (wait)
+ wait_scrub_stripe_io(stripe);
+ bbio = NULL;
+ }
- for (i = 0; i < sblock->sector_count; i++) {
- struct scrub_sector *sector = sblock->sectors[i];
+ if (!bbio) {
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_repair_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = (stripe->logical +
+ (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
+ }
- raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
- scrub_sector_get_page_offset(sector),
- sector->offset + sector->sblock->logical);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ ASSERT(ret == fs_info->sectorsize);
+ }
+ if (bbio) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ if (wait)
+ wait_scrub_stripe_io(stripe);
}
-
- INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
- scrub_block_get(sblock);
- scrub_pending_bio_inc(sctx);
- raid56_submit_missing_rbio(rbio);
- btrfs_put_bioc(bioc);
- return;
-
-rbio_out:
- bio_put(bio);
-bioc_out:
- btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
}
-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace)
+static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
{
- struct scrub_block *sblock;
- const u32 sectorsize = sctx->fs_info->sectorsize;
- int index;
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_device *dev = NULL;
+ u64 physical = 0;
+ int nr_data_sectors = 0;
+ int nr_meta_sectors = 0;
+ int nr_nodatacsum_sectors = 0;
+ int nr_repaired_sectors = 0;
+ int sector_nr;
+
+ if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
+ return;
- sblock = alloc_scrub_block(sctx, dev, logical, physical,
- physical_for_dev_replace, mirror_num);
- if (!sblock) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
- }
+ /*
+ * Init needed infos for error reporting.
+ *
+ * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * thus no need for dev/physical, error reporting still needs dev and physical.
+ */
+ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+ u64 mapped_len = fs_info->sectorsize;
+ struct btrfs_io_context *bioc = NULL;
+ int stripe_index = stripe->mirror_num - 1;
+ int ret;
- for (index = 0; len > 0; index++) {
- struct scrub_sector *sector;
+ /* For scrub, our mirror_num should always start at 1. */
+ ASSERT(stripe->mirror_num >= 1);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
+ stripe->logical, &mapped_len, &bioc);
/*
- * Here we will allocate one page for one sector to scrub.
- * This is fine if PAGE_SIZE == sectorsize, but will cost
- * more memory for PAGE_SIZE > sectorsize case.
+ * If we failed, dev will be NULL, and later detailed reports
+ * will just be skipped.
*/
- u32 l = min(sectorsize, len);
+ if (ret < 0)
+ goto skip;
+ physical = bioc->stripes[stripe_index].physical;
+ dev = bioc->stripes[stripe_index].dev;
+ btrfs_put_bioc(bioc);
+ }
- sector = alloc_scrub_sector(sblock, logical);
- if (!sector) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- scrub_block_put(sblock);
- return -ENOMEM;
- }
- sector->flags = flags;
- sector->generation = gen;
- if (csum) {
- sector->have_csum = 1;
- memcpy(sector->csum, csum, sctx->fs_info->csum_size);
+skip:
+ for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ bool repaired = false;
+
+ if (stripe->sectors[sector_nr].is_metadata) {
+ nr_meta_sectors++;
} else {
- sector->have_csum = 0;
+ nr_data_sectors++;
+ if (!stripe->sectors[sector_nr].csum)
+ nr_nodatacsum_sectors++;
}
- len -= l;
- logical += l;
- physical += l;
- physical_for_dev_replace += l;
- }
- WARN_ON(sblock->sector_count == 0);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
+ if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
+ !test_bit(sector_nr, &stripe->error_bitmap)) {
+ nr_repaired_sectors++;
+ repaired = true;
+ }
+
+ /* Good sector from the beginning, nothing need to be done. */
+ if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+ continue;
+
/*
- * This case should only be hit for RAID 5/6 device replace. See
- * the comment in scrub_missing_raid56_pages() for details.
+ * Report error for the corrupted sectors. If repaired, just
+ * output the message of repaired message.
*/
- scrub_missing_raid56_pages(sblock);
- } else {
- for (index = 0; index < sblock->sector_count; index++) {
- struct scrub_sector *sector = sblock->sectors[index];
- int ret;
-
- ret = scrub_add_sector_to_rd_bio(sctx, sector);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
+ if (repaired) {
+ if (dev) {
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s physical %llu",
+ stripe->logical, btrfs_dev_name(dev),
+ physical);
+ } else {
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on mirror %u",
+ stripe->logical, stripe->mirror_num);
}
+ continue;
}
- if (flags & BTRFS_EXTENT_FLAG_SUPER)
- scrub_submit(sctx);
- }
-
- /* last one frees, either here or in bio completion for last page */
- scrub_block_put(sblock);
- return 0;
-}
-
-static void scrub_bio_end_io(struct bio *bio)
-{
- struct scrub_bio *sbio = bio->bi_private;
- struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
+ /* The remaining are all for unrepaired. */
+ if (dev) {
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+ stripe->logical, btrfs_dev_name(dev),
+ physical);
+ } else {
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on mirror %u",
+ stripe->logical, stripe->mirror_num);
+ }
- sbio->status = bio->bi_status;
- sbio->bio = bio;
+ if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("i/o error", dev, false,
+ stripe->logical, physical);
+ if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("checksum error", dev, false,
+ stripe->logical, physical);
+ if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("header error", dev, false,
+ stripe->logical, physical);
+ }
- queue_work(fs_info->scrub_workers, &sbio->work);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
+ sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
+ sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
+ sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
+ sctx->stat.no_csum += nr_nodatacsum_sectors;
+ sctx->stat.read_errors +=
+ bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors);
+ sctx->stat.csum_errors +=
+ bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors);
+ sctx->stat.verify_errors +=
+ bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors);
+ sctx->stat.uncorrectable_errors +=
+ bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+ sctx->stat.corrected_errors += nr_repaired_sectors;
+ spin_unlock(&sctx->stat_lock);
}
-static void scrub_bio_end_io_worker(struct work_struct *work)
+/*
+ * The main entrance for all read related scrub work, including:
+ *
+ * - Wait for the initial read to finish
+ * - Verify and locate any bad sectors
+ * - Go through the remaining mirrors and try to read as large blocksize as
+ * possible
+ * - Go through all mirrors (including the failed mirror) sector-by-sector
+ *
+ * Writeback does not happen here, it needs extra synchronization.
+ */
+static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
- struct scrub_ctx *sctx = sbio->sctx;
+ struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+ stripe->bg->length);
+ int mirror;
int i;
- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
- if (sbio->status) {
- for (i = 0; i < sbio->sector_count; i++) {
- struct scrub_sector *sector = sbio->sectors[i];
+ ASSERT(stripe->mirror_num > 0);
- sector->io_error = 1;
- sector->sblock->no_io_error_seen = 0;
- }
- }
+ wait_scrub_stripe_io(stripe);
+ scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+ /* Save the initial failed bitmap for later repair and report usage. */
+ stripe->init_error_bitmap = stripe->error_bitmap;
- /* Now complete the scrub_block items that have all pages completed */
- for (i = 0; i < sbio->sector_count; i++) {
- struct scrub_sector *sector = sbio->sectors[i];
- struct scrub_block *sblock = sector->sblock;
+ if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+ goto out;
- if (atomic_dec_and_test(&sblock->outstanding_sectors))
- scrub_block_complete(sblock);
- scrub_block_put(sblock);
+ /*
+ * Try all remaining mirrors.
+ *
+ * Here we still try to read as large block as possible, as this is
+ * faster and we have extra safety nets to rely on.
+ */
+ for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
+ mirror != stripe->mirror_num;
+ mirror = calc_next_mirror(mirror, num_copies)) {
+ const unsigned long old_error_bitmap = stripe->error_bitmap;
+
+ scrub_stripe_submit_repair_read(stripe, mirror,
+ BTRFS_STRIPE_LEN, false);
+ wait_scrub_stripe_io(stripe);
+ scrub_verify_one_stripe(stripe, old_error_bitmap);
+ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ goto out;
}
- bio_put(sbio->bio);
- sbio->bio = NULL;
- spin_lock(&sctx->list_lock);
- sbio->next_free = sctx->first_free;
- sctx->first_free = sbio->index;
- spin_unlock(&sctx->list_lock);
+ /*
+ * Last safety net, try re-checking all mirrors, including the failed
+ * one, sector-by-sector.
+ *
+ * As if one sector failed the drive's internal csum, the whole read
+ * containing the offending sector would be marked as error.
+ * Thus here we do sector-by-sector read.
+ *
+ * This can be slow, thus we only try it as the last resort.
+ */
- if (sctx->is_dev_replace && sctx->flush_all_writes) {
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- }
+ for (i = 0, mirror = stripe->mirror_num;
+ i < num_copies;
+ i++, mirror = calc_next_mirror(mirror, num_copies)) {
+ const unsigned long old_error_bitmap = stripe->error_bitmap;
- scrub_pending_bio_dec(sctx);
+ scrub_stripe_submit_repair_read(stripe, mirror,
+ fs_info->sectorsize, true);
+ wait_scrub_stripe_io(stripe);
+ scrub_verify_one_stripe(stripe, old_error_bitmap);
+ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ goto out;
+ }
+out:
+ scrub_stripe_report_errors(stripe->sctx, stripe);
+ set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
+ wake_up(&stripe->repair_wait);
}
-static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
- unsigned long *bitmap,
- u64 start, u32 len)
+static void scrub_read_endio(struct btrfs_bio *bbio)
{
- u64 offset;
- u32 nsectors;
- u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
+ struct scrub_stripe *stripe = bbio->private;
- if (len >= sparity->stripe_len) {
- bitmap_set(bitmap, 0, sparity->nsectors);
- return;
+ if (bbio->bio.bi_status) {
+ bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ } else {
+ bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
}
-
- start -= sparity->logic_start;
- start = div64_u64_rem(start, sparity->stripe_len, &offset);
- offset = offset >> sectorsize_bits;
- nsectors = len >> sectorsize_bits;
-
- if (offset + nsectors <= sparity->nsectors) {
- bitmap_set(bitmap, offset, nsectors);
- return;
+ bio_put(&bbio->bio);
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
}
-
- bitmap_set(bitmap, offset, sparity->nsectors - offset);
- bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
-}
-
-static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
- u64 start, u32 len)
-{
- __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}
-static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
- u64 start, u32 len)
+static void scrub_write_endio(struct btrfs_bio *bbio)
{
- __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
-}
-
-static void scrub_block_complete(struct scrub_block *sblock)
-{
- int corrupted = 0;
+ struct scrub_stripe *stripe = bbio->private;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ u32 bio_size = 0;
+ int i;
- if (!sblock->no_io_error_seen) {
- corrupted = 1;
- scrub_handle_errored_block(sblock);
- } else {
- /*
- * if has checksum error, write via repair mechanism in
- * dev replace case, otherwise write here in dev replace
- * case.
- */
- corrupted = scrub_checksum(sblock);
- if (!corrupted && sblock->sctx->is_dev_replace)
- scrub_write_block_to_dev_replace(sblock);
- }
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
- if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->logical;
- u64 end = sblock->logical +
- sblock->sectors[sblock->sector_count - 1]->offset +
- sblock->sctx->fs_info->sectorsize;
+ if (bbio->bio.bi_status) {
+ unsigned long flags;
- ASSERT(end - start <= U32_MAX);
- scrub_parity_mark_sectors_error(sblock->sparity,
- start, end - start);
+ spin_lock_irqsave(&stripe->write_error_lock, flags);
+ bitmap_set(&stripe->write_error_bitmap, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&stripe->write_error_lock, flags);
}
-}
+ bio_put(&bbio->bio);
-static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
-{
- sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
- list_del(&sum->list);
- kfree(sum);
+ if (atomic_dec_and_test(&stripe->pending_io))
+ wake_up(&stripe->io_wait);
}
/*
- * Find the desired csum for range [logical, logical + sectorsize), and store
- * the csum into @csum.
+ * Submit the write bio(s) for the sectors specified by @write_bitmap.
*
- * The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're responsible to cleanup any range
- * that is before @logical.
+ * Here we utilize btrfs_submit_repair_write(), which has some extra benefits:
*
- * Return 0 if there is no csum for the range.
- * Return 1 if there is csum for the range and copied to @csum.
+ * - Only needs logical bytenr and mirror_num
+ * Just like the scrub read path
+ *
+ * - Would only result in writes to the specified mirror
+ * Unlike the regular writeback path, which would write back to all stripes
+ *
+ * - Handle dev-replace and read-repair writeback differently
*/
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
+static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
+ unsigned long write_bitmap, bool dev_replace)
{
- bool found = false;
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ const bool zoned = btrfs_is_zoned(fs_info);
+ int sector_nr;
- while (!list_empty(&sctx->csum_list)) {
- struct btrfs_ordered_sum *sum = NULL;
- unsigned long index;
- unsigned long num_sectors;
-
- sum = list_first_entry(&sctx->csum_list,
- struct btrfs_ordered_sum, list);
- /* The current csum range is beyond our range, no csum found */
- if (sum->bytenr > logical)
- break;
+ for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, sector_nr);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+ int ret;
- /*
- * The current sum is before our bytenr, since scrub is always
- * done in bytenr order, the csum will never be used anymore,
- * clean it up so that later calls won't bother with the range,
- * and continue search the next range.
- */
- if (sum->bytenr + sum->len <= logical) {
- drop_csum_range(sctx, sum);
- continue;
+ /* We should only writeback sectors covered by an extent. */
+ ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+
+ /* Cannot merge with previous sector, submit the current one. */
+ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
+ fill_writer_pointer_gap(sctx, stripe->physical +
+ (sector_nr << fs_info->sectorsize_bits));
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+ /* For zoned writeback, queue depth must be 1. */
+ if (zoned)
+ wait_scrub_stripe_io(stripe);
+ bbio = NULL;
}
-
- /* Now the csum range covers our bytenr, copy the csum */
- found = true;
- index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
- num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
-
- memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
- sctx->fs_info->csum_size);
-
- /* Cleanup the range if we're at the end of the csum range */
- if (index == num_sectors - 1)
- drop_csum_range(sctx, sum);
- break;
+ if (!bbio) {
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
+ fs_info, scrub_write_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = (stripe->logical +
+ (sector_nr << fs_info->sectorsize_bits)) >>
+ SECTOR_SHIFT;
+ }
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ ASSERT(ret == fs_info->sectorsize);
+ }
+ if (bbio) {
+ fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector <<
+ SECTOR_SHIFT);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);
+ if (zoned)
+ wait_scrub_stripe_io(stripe);
}
- if (!found)
- return 0;
- return 1;
}
-/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
- u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num)
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
+ unsigned int bio_size)
{
- struct btrfs_device *src_dev = dev;
- u64 src_physical = physical;
- int src_mirror = mirror_num;
- int ret;
- u8 csum[BTRFS_CSUM_SIZE];
- u32 blocksize;
+ const int time_slice = 1000;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- blocksize = map->stripe_len;
- else
- blocksize = sctx->fs_info->sectorsize;
- spin_lock(&sctx->stat_lock);
- sctx->stat.data_extents_scrubbed++;
- sctx->stat.data_bytes_scrubbed += len;
- spin_unlock(&sctx->stat_lock);
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- blocksize = map->stripe_len;
- else
- blocksize = sctx->fs_info->nodesize;
- spin_lock(&sctx->stat_lock);
- sctx->stat.tree_extents_scrubbed++;
- sctx->stat.tree_bytes_scrubbed += len;
- spin_unlock(&sctx->stat_lock);
- } else {
- blocksize = sctx->fs_info->sectorsize;
- WARN_ON(1);
- }
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
/*
- * For dev-replace case, we can have @dev being a missing device.
- * Regular scrub will avoid its execution on missing device at all,
- * as that would trigger tons of read error.
- *
- * Reading from missing device will cause read error counts to
- * increase unnecessarily.
- * So here we change the read source to a good mirror.
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
*/
- if (sctx->is_dev_replace && !dev->bdev)
- scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
- &src_dev, &src_mirror);
- while (len) {
- u32 l = min(len, blocksize);
- int have_csum = 0;
-
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- /* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, csum);
- if (have_csum == 0)
- ++sctx->stat.no_csum;
- }
- ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
- flags, gen, src_mirror,
- have_csum ? csum : NULL, physical);
- if (ret)
- return ret;
- len -= l;
- logical += l;
- physical += l;
- src_physical += l;
- }
- return 0;
-}
-
-static int scrub_sectors_for_parity(struct scrub_parity *sparity,
- u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev,
- u64 flags, u64 gen, int mirror_num, u8 *csum)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_block *sblock;
- const u32 sectorsize = sctx->fs_info->sectorsize;
- int index;
-
- ASSERT(IS_ALIGNED(len, sectorsize));
-
- sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
- if (!sblock) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
- }
-
- sblock->sparity = sparity;
- scrub_parity_get(sparity);
-
- for (index = 0; len > 0; index++) {
- struct scrub_sector *sector;
-
- sector = alloc_scrub_sector(sblock, logical);
- if (!sector) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- scrub_block_put(sblock);
- return -ENOMEM;
- }
- sblock->sectors[index] = sector;
- /* For scrub parity */
- scrub_sector_get(sector);
- list_add_tail(&sector->list, &sparity->sectors_list);
- sector->flags = flags;
- sector->generation = gen;
- if (csum) {
- sector->have_csum = 1;
- memcpy(sector->csum, csum, sctx->fs_info->csum_size);
- } else {
- sector->have_csum = 0;
- }
-
- /* Iterate over the stripe range in sectorsize steps */
- len -= sectorsize;
- logical += sectorsize;
- physical += sectorsize;
- }
-
- WARN_ON(sblock->sector_count == 0);
- for (index = 0; index < sblock->sector_count; index++) {
- struct scrub_sector *sector = sblock->sectors[index];
- int ret;
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
- ret = scrub_add_sector_to_rd_bio(sctx, sector);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
- }
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
}
- /* Last one frees, either here or in bio completion for last sector */
- scrub_block_put(sblock);
- return 0;
-}
-
-static int scrub_extent_for_parity(struct scrub_parity *sparity,
- u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev,
- u64 flags, u64 gen, int mirror_num)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- int ret;
- u8 csum[BTRFS_CSUM_SIZE];
- u32 blocksize;
-
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
- scrub_parity_mark_sectors_error(sparity, logical, len);
- return 0;
- }
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += bio_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sparity->stripe_len;
- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sparity->stripe_len;
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
} else {
- blocksize = sctx->fs_info->sectorsize;
- WARN_ON(1);
+ /* New request after deadline, start new epoch */
+ delta = 0;
}
- while (len) {
- u32 l = min(len, blocksize);
- int have_csum = 0;
+ if (delta) {
+ long timeout;
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- /* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, csum);
- if (have_csum == 0)
- goto skip;
- }
- ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
- flags, gen, mirror_num,
- have_csum ? csum : NULL);
- if (ret)
- return ret;
-skip:
- len -= l;
- logical += l;
- physical += l;
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
}
- return 0;
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
}
/*
@@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
{
int i;
int j = 0;
- u64 stripe_nr;
u64 last_offset;
- u32 stripe_index;
- u32 rot;
const int data_stripes = nr_data_stripes(map);
last_offset = (physical - map->stripes[num].physical) * data_stripes;
@@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num,
*offset = last_offset;
for (i = 0; i < data_stripes; i++) {
- *offset = last_offset + i * map->stripe_len;
+ u32 stripe_nr;
+ u32 stripe_index;
+ u32 rot;
- stripe_nr = div64_u64(*offset, map->stripe_len);
- stripe_nr = div_u64(stripe_nr, data_stripes);
+ *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
+
+ stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
/* Work out the disk rotation on this stripe-set */
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
+ rot = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
@@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num,
if (stripe_index < num)
j++;
}
- *offset = last_offset + j * map->stripe_len;
+ *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
return 1;
}
-static void scrub_free_parity(struct scrub_parity *sparity)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_sector *curr, *next;
- int nbits;
-
- nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
- if (nbits) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.read_errors += nbits;
- sctx->stat.uncorrectable_errors += nbits;
- spin_unlock(&sctx->stat_lock);
- }
-
- list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
- list_del_init(&curr->list);
- scrub_sector_put(curr);
- }
-
- kfree(sparity);
-}
-
-static void scrub_parity_bio_endio_worker(struct work_struct *work)
-{
- struct scrub_parity *sparity = container_of(work, struct scrub_parity,
- work);
- struct scrub_ctx *sctx = sparity->sctx;
-
- btrfs_bio_counter_dec(sctx->fs_info);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
-}
-
-static void scrub_parity_bio_endio(struct bio *bio)
-{
- struct scrub_parity *sparity = bio->bi_private;
- struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
-
- if (bio->bi_status)
- bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
- &sparity->dbitmap, sparity->nsectors);
-
- bio_put(bio);
-
- INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
- queue_work(fs_info->scrub_parity_workers, &sparity->work);
-}
-
-static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
-{
- struct scrub_ctx *sctx = sparity->sctx;
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio *bio;
- struct btrfs_raid_bio *rbio;
- struct btrfs_io_context *bioc = NULL;
- u64 length;
- int ret;
-
- if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
- &sparity->ebitmap, sparity->nsectors))
- goto out;
-
- length = sparity->logic_end - sparity->logic_start;
-
- btrfs_bio_counter_inc_blocked(fs_info);
- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bioc);
- if (ret || !bioc || !bioc->raid_map)
- goto bioc_out;
-
- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = sparity->logic_start >> 9;
- bio->bi_private = sparity;
- bio->bi_end_io = scrub_parity_bio_endio;
-
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
- sparity->scrub_dev,
- &sparity->dbitmap,
- sparity->nsectors);
- btrfs_put_bioc(bioc);
- if (!rbio)
- goto rbio_out;
-
- scrub_pending_bio_inc(sctx);
- raid56_parity_submit_scrub_rbio(rbio);
- return;
-
-rbio_out:
- bio_put(bio);
-bioc_out:
- btrfs_bio_counter_dec(fs_info);
- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
- sparity->nsectors);
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
-out:
- scrub_free_parity(sparity);
-}
-
-static void scrub_parity_get(struct scrub_parity *sparity)
-{
- refcount_inc(&sparity->refs);
-}
-
-static void scrub_parity_put(struct scrub_parity *sparity)
-{
- if (!refcount_dec_and_test(&sparity->refs))
- return;
-
- scrub_parity_check_and_repair(sparity);
-}
-
/*
* Return 0 if the extent item range covers any byte of the range.
* Return <0 if the extent item is before @search_start.
@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
*generation_ret = btrfs_extent_generation(path->nodes[0], ei);
}
-static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
- u64 boundary_start, u64 boudary_len)
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
{
- return (extent_start < boundary_start &&
- extent_start + extent_len > boundary_start) ||
- (extent_start < boundary_start + boudary_len &&
- extent_start + extent_len > boundary_start + boudary_len);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+ return ret;
}
-static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
- struct scrub_parity *sparity,
- struct map_lookup *map,
- struct btrfs_device *sdev,
- struct btrfs_path *path,
- u64 logical)
+static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
+ struct scrub_stripe *stripe,
+ u64 extent_start, u64 extent_len,
+ u64 extent_flags, u64 extent_gen)
+{
+ for (u64 cur_logical = max(stripe->logical, extent_start);
+ cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN,
+ extent_start + extent_len);
+ cur_logical += fs_info->sectorsize) {
+ const int nr_sector = (cur_logical - stripe->logical) >>
+ fs_info->sectorsize_bits;
+ struct scrub_sector_verification *sector =
+ &stripe->sectors[nr_sector];
+
+ set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ sector->is_metadata = true;
+ sector->generation = extent_gen;
+ }
+ }
+}
+
+static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
- u64 cur_logical = logical;
+ stripe->extent_sector_bitmap = 0;
+ stripe->init_error_bitmap = 0;
+ stripe->error_bitmap = 0;
+ stripe->io_error_bitmap = 0;
+ stripe->csum_error_bitmap = 0;
+ stripe->meta_error_bitmap = 0;
+}
+
+/*
+ * Locate one stripe which has at least one extent in its range.
+ *
+ * Return 0 if found such stripe, and store its info into @stripe.
+ * Return >0 if there is no such stripe in the specified range.
+ * Return <0 for error.
+ */
+static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
+ struct btrfs_device *dev, u64 physical,
+ int mirror_num, u64 logical_start,
+ u32 logical_len,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
+ const u64 logical_end = logical_start + logical_len;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ u64 stripe_end;
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
int ret;
- ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
+ stripe->nr_sectors);
+ scrub_stripe_reset_bitmaps(stripe);
- /* Path must not be populated */
- ASSERT(!path->nodes[0]);
+ /* The range must be inside the bg. */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
- while (cur_logical < logical + map->stripe_len) {
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_device *extent_dev;
- u64 extent_start;
- u64 extent_size;
- u64 mapped_length;
- u64 extent_flags;
- u64 extent_gen;
- u64 extent_physical;
- u64 extent_mirror_num;
-
- ret = find_first_extent_item(extent_root, path, cur_logical,
- logical + map->stripe_len - cur_logical);
- /* No more extent item in this data stripe */
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+
+ ret = find_first_extent_item(extent_root, &path, logical_start, logical_len);
+ /* Either error or not found. */
+ if (ret)
+ goto out;
+ get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ stripe->nr_meta_extents++;
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+ stripe->nr_data_extents++;
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Round down to stripe boundary.
+ *
+ * The extra calculation against bg->start is to handle block groups
+ * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
+ */
+ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
+ bg->start;
+ stripe->physical = physical + stripe->logical - logical_start;
+ stripe->dev = dev;
+ stripe->bg = bg;
+ stripe->mirror_num = mirror_num;
+ stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;
+
+ /* Fill the first extent info into stripe->sectors[] array. */
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+
+ /* Fill the extent info for the remaining sectors. */
+ while (cur_logical <= stripe_end) {
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ stripe_end - cur_logical + 1);
+ if (ret < 0)
+ goto out;
if (ret > 0) {
ret = 0;
break;
}
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+ stripe->nr_meta_extents++;
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
+ stripe->nr_data_extents++;
+ fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
+ extent_flags, extent_gen);
+ cur_logical = extent_start + extent_len;
+ }
+
+ /* Now fill the data csum. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int sector_nr;
+ unsigned long csum_bitmap = 0;
+
+ /* Csum space should have already been allocated. */
+ ASSERT(stripe->csums);
+
+ /*
+ * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
+ * should contain at most 16 sectors.
+ */
+ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+
+ ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical,
+ stripe_end, stripe->csums,
+ &csum_bitmap, true);
if (ret < 0)
- break;
- get_extent_info(path, &extent_start, &extent_size, &extent_flags,
- &extent_gen);
+ goto out;
+ if (ret > 0)
+ ret = 0;
+
+ for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
+ stripe->sectors[sector_nr].csum = stripe->csums +
+ sector_nr * fs_info->csum_size;
+ }
+ }
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+out:
+ btrfs_release_path(&path);
+ return ret;
+}
+
+static void scrub_reset_stripe(struct scrub_stripe *stripe)
+{
+ scrub_stripe_reset_bitmaps(stripe);
+
+ stripe->nr_meta_extents = 0;
+ stripe->nr_data_extents = 0;
+ stripe->state = 0;
+
+ for (int i = 0; i < stripe->nr_sectors; i++) {
+ stripe->sectors[i].is_metadata = false;
+ stripe->sectors[i].csum = NULL;
+ stripe->sectors[i].generation = 0;
+ }
+}
+
+static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_bio *bbio;
+ int mirror = stripe->mirror_num;
+
+ ASSERT(stripe->bg);
+ ASSERT(stripe->mirror_num > 0);
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+
+ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ scrub_read_endio, stripe);
+
+ /* Read the whole stripe. */
+ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+ for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ int ret;
+
+ ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ /* We should have allocated enough bio vectors. */
+ ASSERT(ret == PAGE_SIZE);
+ }
+ atomic_inc(&stripe->pending_io);
+
+ /*
+ * For dev-replace, either user asks to avoid the source dev, or
+ * the device is missing, we try the next mirror instead.
+ */
+ if (sctx->is_dev_replace &&
+ (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
+ !stripe->dev->bdev)) {
+ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
+ stripe->bg->length);
+
+ mirror = calc_next_mirror(mirror, num_copies);
+ }
+ btrfs_submit_bio(bbio, mirror);
+}
+
+static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
+{
+ int i;
+
+ for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
+ if (stripe->sectors[i].is_metadata) {
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
- /* Metadata should not cross stripe boundaries */
- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- does_range_cross_boundary(extent_start, extent_size,
- logical, map->stripe_len)) {
btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- extent_start, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- cur_logical += extent_size;
- continue;
+ "stripe %llu has unrepaired metadata sector at %llu",
+ stripe->logical,
+ stripe->logical + (i << fs_info->sectorsize_bits));
+ return true;
}
+ }
+ return false;
+}
- /* Skip hole range which doesn't have any extent */
- cur_logical = max(extent_start, cur_logical);
+static int flush_scrub_stripes(struct scrub_ctx *sctx)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct scrub_stripe *stripe;
+ const int nr_stripes = sctx->cur_stripe;
+ int ret = 0;
- /* Truncate the range inside this data stripe */
- extent_size = min(extent_start + extent_size,
- logical + map->stripe_len) - cur_logical;
- extent_start = cur_logical;
- ASSERT(extent_size <= U32_MAX);
+ if (!nr_stripes)
+ return 0;
- scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));
- mapped_length = extent_size;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
- &mapped_length, &bioc, 0);
- if (!ret && (!bioc || mapped_length < extent_size))
- ret = -EIO;
- if (ret) {
- btrfs_put_bioc(bioc);
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
+ nr_stripes << BTRFS_STRIPE_LEN_SHIFT);
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+ scrub_submit_initial_read(sctx, stripe);
+ }
+
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ wait_event(stripe->repair_wait,
+ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+ }
+
+ /*
+ * Submit the repaired sectors. For zoned case, we cannot do repair
+ * in-place, but queue the bg to be relocated.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) {
+ btrfs_repair_one_zone(fs_info,
+ sctx->stripes[0].bg->start);
+ break;
+ }
}
- extent_physical = bioc->stripes[0].physical;
- extent_mirror_num = bioc->mirror_num;
- extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
+ } else {
+ for (int i = 0; i < nr_stripes; i++) {
+ unsigned long repaired;
- ret = btrfs_lookup_csums_list(csum_root, extent_start,
- extent_start + extent_size - 1,
- &sctx->csum_list, 1, false);
- if (ret) {
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ stripe = &sctx->stripes[i];
+
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ }
+ }
+
+ /* Submit for dev-replace. */
+ if (sctx->is_dev_replace) {
+ /*
+ * For dev-replace, if we know there is something wrong with
+ * metadata, we should immedately abort.
+ */
+ for (int i = 0; i < nr_stripes; i++) {
+ if (stripe_has_metadata_error(&sctx->stripes[i])) {
+ ret = -EIO;
+ goto out;
+ }
}
+ for (int i = 0; i < nr_stripes; i++) {
+ unsigned long good;
- ret = scrub_extent_for_parity(sparity, extent_start,
- extent_size, extent_physical,
- extent_dev, extent_flags,
- extent_gen, extent_mirror_num);
- scrub_free_csums(sctx);
+ stripe = &sctx->stripes[i];
- if (ret) {
- scrub_parity_mark_sectors_error(sparity, extent_start,
- extent_size);
- break;
+ ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
+
+ bitmap_andnot(&good, &stripe->extent_sector_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, good, true);
}
+ }
- cond_resched();
- cur_logical += extent_size;
+ /* Wait for the above writebacks to finish. */
+ for (int i = 0; i < nr_stripes; i++) {
+ stripe = &sctx->stripes[i];
+
+ wait_scrub_stripe_io(stripe);
+ scrub_reset_stripe(stripe);
}
- btrfs_release_path(path);
+out:
+ sctx->cur_stripe = 0;
return ret;
}
-static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
- struct map_lookup *map,
- struct btrfs_device *sdev,
- u64 logic_start,
- u64 logic_end)
+static void raid56_scrub_wait_endio(struct bio *bio)
{
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_path *path;
- u64 cur_logical;
+ complete(bio->bi_private);
+}
+
+static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
+ struct btrfs_device *dev, int mirror_num,
+ u64 logical, u32 length, u64 physical)
+{
+ struct scrub_stripe *stripe;
int ret;
- struct scrub_parity *sparity;
- int nsectors;
- path = btrfs_alloc_path();
- if (!path) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- return -ENOMEM;
+ /* No available slot, submit all stripes and wait for them. */
+ if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) {
+ ret = flush_scrub_stripes(sctx);
+ if (ret < 0)
+ return ret;
}
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ASSERT(map->stripe_len <= U32_MAX);
- nsectors = map->stripe_len >> fs_info->sectorsize_bits;
- ASSERT(nsectors <= BITS_PER_LONG);
- sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
- if (!sparity) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_free_path(path);
- return -ENOMEM;
- }
+ stripe = &sctx->stripes[sctx->cur_stripe];
- ASSERT(map->stripe_len <= U32_MAX);
- sparity->stripe_len = map->stripe_len;
- sparity->nsectors = nsectors;
- sparity->sctx = sctx;
- sparity->scrub_dev = sdev;
- sparity->logic_start = logic_start;
- sparity->logic_end = logic_end;
- refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->sectors_list);
+ /* We can queue one stripe using the remaining slot. */
+ scrub_reset_stripe(stripe);
+ ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num,
+ logical, length, stripe);
+ /* Either >0 as no more extents or <0 for error. */
+ if (ret)
+ return ret;
+ sctx->cur_stripe++;
+ return 0;
+}
- ret = 0;
- for (cur_logical = logic_start; cur_logical < logic_end;
- cur_logical += map->stripe_len) {
- ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
- sdev, path, cur_logical);
+static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
+ struct btrfs_device *scrub_dev,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 full_stripe_start)
+{
+ DECLARE_COMPLETION_ONSTACK(io_done);
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_raid_bio *rbio;
+ struct btrfs_io_context *bioc = NULL;
+ struct bio *bio;
+ struct scrub_stripe *stripe;
+ bool all_empty = true;
+ const int data_stripes = nr_data_stripes(map);
+ unsigned long extent_bitmap = 0;
+ u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT;
+ int ret;
+
+ ASSERT(sctx->raid56_data_stripes);
+
+ for (int i = 0; i < data_stripes; i++) {
+ int stripe_index;
+ int rot;
+ u64 physical;
+
+ stripe = &sctx->raid56_data_stripes[i];
+ rot = div_u64(full_stripe_start - bg->start,
+ data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_index = (i + rot) % map->num_stripes;
+ physical = map->stripes[stripe_index].physical +
+ (rot << BTRFS_STRIPE_LEN_SHIFT);
+
+ scrub_reset_stripe(stripe);
+ set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
+ ret = scrub_find_fill_first_stripe(bg,
+ map->stripes[stripe_index].dev, physical, 1,
+ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT),
+ BTRFS_STRIPE_LEN, stripe);
if (ret < 0)
+ goto out;
+ /*
+ * No extent in this data stripe, need to manually mark them
+ * initialized to make later read submission happy.
+ */
+ if (ret > 0) {
+ stripe->logical = full_stripe_start +
+ (i << BTRFS_STRIPE_LEN_SHIFT);
+ stripe->dev = map->stripes[stripe_index].dev;
+ stripe->mirror_num = 1;
+ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
+ }
+ }
+
+ /* Check if all data stripes are empty. */
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
+ if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+ all_empty = false;
break;
+ }
+ }
+ if (all_empty) {
+ ret = 0;
+ goto out;
}
- scrub_parity_put(sparity);
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
+ scrub_submit_initial_read(sctx, stripe);
+ }
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
- btrfs_free_path(path);
- return ret < 0 ? ret : 0;
-}
+ wait_event(stripe->repair_wait,
+ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
+ }
+ /* For now, no zoned support for RAID56. */
+ ASSERT(!btrfs_is_zoned(sctx->fs_info));
-static void sync_replace_for_zoned(struct scrub_ctx *sctx)
-{
- if (!btrfs_is_zoned(sctx->fs_info))
- return;
+ /* Writeback for the repaired sectors. */
+ for (int i = 0; i < data_stripes; i++) {
+ unsigned long repaired;
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
+ stripe = &sctx->raid56_data_stripes[i];
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-}
+ bitmap_andnot(&repaired, &stripe->init_error_bitmap,
+ &stripe->error_bitmap, stripe->nr_sectors);
+ scrub_write_sectors(sctx, stripe, repaired, false);
+ }
-static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
- u64 physical, u64 physical_end)
-{
- struct btrfs_fs_info *fs_info = sctx->fs_info;
- int ret = 0;
+ /* Wait for the above writebacks to finish. */
+ for (int i = 0; i < data_stripes; i++) {
+ stripe = &sctx->raid56_data_stripes[i];
- if (!btrfs_is_zoned(fs_info))
- return 0;
+ wait_scrub_stripe_io(stripe);
+ }
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+ /*
+ * Now all data stripes are properly verified. Check if we have any
+ * unrepaired, if so abort immediately or we could further corrupt the
+ * P/Q stripes.
+ *
+ * During the loop, also populate extent_bitmap.
+ */
+ for (int i = 0; i < data_stripes; i++) {
+ unsigned long error;
- mutex_lock(&sctx->wr_lock);
- if (sctx->write_pointer < physical_end) {
- ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
- physical,
- sctx->write_pointer);
- if (ret)
+ stripe = &sctx->raid56_data_stripes[i];
+
+ /*
+ * We should only check the errors where there is an extent.
+ * As we may hit an empty data stripe while it's missing.
+ */
+ bitmap_and(&error, &stripe->error_bitmap,
+ &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ if (!bitmap_empty(&error, stripe->nr_sectors)) {
btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+ full_stripe_start, i, stripe->nr_sectors,
+ &error);
+ ret = -EIO;
+ goto out;
+ }
+ bitmap_or(&extent_bitmap, &extent_bitmap,
+ &stripe->extent_sector_bitmap, stripe->nr_sectors);
}
- mutex_unlock(&sctx->wr_lock);
- btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+ /* Now we can check and regenerate the P/Q stripe. */
+ bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
+ bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
+ bio->bi_private = &io_done;
+ bio->bi_end_io = raid56_scrub_wait_endio;
+
+ btrfs_bio_counter_inc_blocked(fs_info);
+ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
+ &length, &bioc);
+ if (ret < 0) {
+ btrfs_put_bioc(bioc);
+ btrfs_bio_counter_dec(fs_info);
+ goto out;
+ }
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
+ btrfs_put_bioc(bioc);
+ if (!rbio) {
+ ret = -ENOMEM;
+ btrfs_bio_counter_dec(fs_info);
+ goto out;
+ }
+ raid56_parity_submit_scrub_rbio(rbio);
+ wait_for_completion_io(&io_done);
+ ret = blk_status_to_errno(bio->bi_status);
+ bio_put(bio);
+ btrfs_bio_counter_dec(fs_info);
+
+out:
return ret;
}
@@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
* and @logical_length parameter.
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
u64 logical_start, u64 logical_length,
@@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
const u64 logical_end = logical_start + logical_length;
/* An artificial limit, inherit from old scrub behavior */
- const u32 max_length = SZ_64K;
struct btrfs_path path = { 0 };
u64 cur_logical = logical_start;
int ret;
@@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
path.skip_locking = 1;
/* Go through each extent items inside the logical range */
while (cur_logical < logical_end) {
- u64 extent_start;
- u64 extent_len;
- u64 extent_flags;
- u64 extent_gen;
- u64 scrub_len;
+ u64 cur_physical = physical + cur_logical - logical_start;
/* Canceled? */
if (atomic_read(&fs_info->scrub_cancel_req) ||
@@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
/* Paused? */
if (atomic_read(&fs_info->scrub_pause_req)) {
/* Push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
scrub_blocked_if_needed(fs_info);
}
/* Block group removed? */
@@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
spin_unlock(&bg->lock);
- ret = find_first_extent_item(extent_root, &path, cur_logical,
- logical_end - cur_logical);
+ ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
+ cur_logical, logical_end - cur_logical,
+ cur_physical);
if (ret > 0) {
/* No more extent, just update the accounting */
sctx->stat.last_physical = physical + logical_length;
@@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
if (ret < 0)
break;
- get_extent_info(&path, &extent_start, &extent_len,
- &extent_flags, &extent_gen);
- /* Skip hole range which doesn't have any extent */
- cur_logical = max(extent_start, cur_logical);
- /*
- * Scrub len has three limits:
- * - Extent size limit
- * - Scrub range limit
- * This is especially imporatant for RAID0/RAID10 to reuse
- * this function
- * - Max scrub size limit
- */
- scrub_len = min(min(extent_start + extent_len,
- logical_end), cur_logical + max_length) -
- cur_logical;
-
- if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_list(csum_root, cur_logical,
- cur_logical + scrub_len - 1,
- &sctx->csum_list, 1, false);
- if (ret)
- break;
- }
- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- does_range_cross_boundary(extent_start, extent_len,
- logical_start, logical_length)) {
- btrfs_err(fs_info,
-"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
- extent_start, logical_start, logical_end);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- cur_logical += scrub_len;
- continue;
- }
- ret = scrub_extent(sctx, map, cur_logical, scrub_len,
- cur_logical - logical_start + physical,
- device, extent_flags, extent_gen,
- mirror_num);
- scrub_free_csums(sctx);
- if (ret)
- break;
- if (sctx->is_dev_replace)
- sync_replace_for_zoned(sctx);
- cur_logical += scrub_len;
+ ASSERT(sctx->cur_stripe > 0);
+ cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical
+ + BTRFS_STRIPE_LEN;
+
/* Don't hold CPU for too long time */
cond_resched();
}
@@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
- return map->num_stripes / map->sub_stripes * map->stripe_len;
+ return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
}
/* Get the logical bytenr for the stripe */
@@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
* (stripe_index / sub_stripes) gives how many data stripes we need to
* skip.
*/
- return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+ return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
+ bg->start;
}
/* Get the mirror number for the stripe */
@@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
}
static int scrub_simple_stripe(struct scrub_ctx *sctx,
- struct btrfs_root *extent_root,
- struct btrfs_root *csum_root,
struct btrfs_block_group *bg,
struct map_lookup *map,
struct btrfs_device *device,
@@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
- cur_logical, map->stripe_len, device,
- cur_physical, mirror_num);
+ ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ BTRFS_STRIPE_LEN, device, cur_physical,
+ mirror_num);
if (ret)
return ret;
/* Skip to next stripe which belongs to the target device */
cur_logical += logical_increment;
/* For physical offset, we just go to next stripe */
- cur_physical += map->stripe_len;
+ cur_physical += BTRFS_STRIPE_LEN;
}
return ret;
}
@@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
int stripe_index)
{
- struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root;
- struct btrfs_root *csum_root;
- struct blk_plug plug;
struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
+ int ret2;
u64 physical = map->stripes[stripe_index].physical;
const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
const u64 physical_end = physical + dev_stripe_len;
@@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */
u64 offset;
u64 stripe_logical;
- u64 stripe_end;
int stop_loop = 0;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- /*
- * work on commit root. The related disk blocks are static as
- * long as COW is applied. This means, it is save to rewrite
- * them to repair disk errors without any race conditions
- */
- path->search_commit_root = 1;
- path->skip_locking = 1;
- path->reada = READA_FORWARD;
-
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- root = btrfs_extent_root(fs_info, bg->start);
- csum_root = btrfs_csum_root(fs_info, bg->start);
-
- /*
- * collect all data csums for the stripe to avoid seeking during
- * the scrub. This might currently (crc32) end up to be about 1MB
- */
- blk_start_plug(&plug);
-
if (sctx->is_dev_replace &&
btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
mutex_lock(&sctx->wr_lock);
sctx->write_pointer = physical;
mutex_unlock(&sctx->wr_lock);
- sctx->flush_all_writes = true;
}
+ /* Prepare the extra data stripes used by RAID56. */
+ if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(sctx->raid56_data_stripes == NULL);
+
+ sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
+ sizeof(struct scrub_stripe),
+ GFP_KERNEL);
+ if (!sctx->raid56_data_stripes) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < nr_data_stripes(map); i++) {
+ ret = init_scrub_stripe(fs_info,
+ &sctx->raid56_data_stripes[i]);
+ if (ret < 0)
+ goto out;
+ sctx->raid56_data_stripes[i].bg = bg;
+ sctx->raid56_data_stripes[i].sctx = sctx;
+ }
+ }
/*
* There used to be a big double loop to handle all profiles using the
* same routine, which grows larger and more gross over time.
@@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- bg->start, bg->length, scrub_dev,
- map->stripes[stripe_index].physical,
+ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
goto out;
}
if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
- ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
- scrub_dev, stripe_index);
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
+ offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
goto out;
}
@@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Initialize @offset in case we need to go to out: label */
get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
- increment = map->stripe_len * nr_data_stripes(map);
+ increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
/*
* Due to the rotation, for RAID56 it's better to iterate each stripe
@@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (ret) {
/* it is parity strip */
stripe_logical += chunk_logical;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- stripe_logical,
- stripe_end);
+ ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
+ map, stripe_logical);
if (ret)
goto out;
goto next;
@@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
- logical, map->stripe_len,
+ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
next:
logical += increment;
- physical += map->stripe_len;
+ physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
if (stop_loop)
sctx->stat.last_physical =
@@ -3754,14 +2237,15 @@ next:
break;
}
out:
- /* push queued extents */
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
-
- blk_finish_plug(&plug);
- btrfs_free_path(path);
+ ret2 = flush_scrub_stripes(sctx);
+ if (!ret2)
+ ret = ret2;
+ if (sctx->raid56_data_stripes) {
+ for (int i = 0; i < nr_data_stripes(map); i++)
+ release_scrub_stripe(&sctx->raid56_data_stripes[i]);
+ kfree(sctx->raid56_data_stripes);
+ sctx->raid56_data_stripes = NULL;
+ }
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
@@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
-
- /*
- * flush, submit all pending read and write bios, afterwards
- * wait for them.
- * Note that in the dev replace case, a read request causes
- * write requests that are submitted in the read completion
- * worker. Therefore in the current situation, it is required
- * that all write requests are flushed, so that all read and
- * write requests are really completed when bios_in_flight
- * changes to 0.
- */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
-
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
-
- scrub_pause_on(fs_info);
-
- /*
- * must be called before we decrease @scrub_paused.
- * make sure we don't block transaction commit while
- * we are waiting pending workers finished.
- */
- wait_event(sctx->list_wait,
- atomic_read(&sctx->workers_pending) == 0);
- sctx->flush_all_writes = false;
-
- scrub_pause_off(fs_info);
-
if (sctx->is_dev_replace &&
!btrfs_finish_block_group_to_copy(dev_replace->srcdev,
cache, found_key.offset))
@@ -4168,18 +2619,62 @@ skip:
return ret;
}
+static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
+ struct page *page, u64 physical, u64 generation)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct bio_vec bvec;
+ struct bio bio;
+ struct btrfs_super_block *sb = page_address(page);
+ int ret;
+
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
+ __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ if (ret < 0)
+ return ret;
+ ret = btrfs_check_super_csum(fs_info, sb);
+ if (ret != 0) {
+ btrfs_err_rl(fs_info,
+ "super block at physical %llu devid %llu has bad csum",
+ physical, dev->devid);
+ return -EIO;
+ }
+ if (btrfs_super_generation(sb) != generation) {
+ btrfs_err_rl(fs_info,
+"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+ physical, dev->devid,
+ btrfs_super_generation(sb), generation);
+ return -EUCLEAN;
+ }
+
+ return btrfs_validate_super(fs_info, sb, -1);
+}
+
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev)
{
int i;
u64 bytenr;
u64 gen;
- int ret;
+ int ret = 0;
+ struct page *page;
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+ return -ENOMEM;
+ }
+
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
@@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
- ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
- if (ret)
- return ret;
+ ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
+ if (ret) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.super_errors++;
+ spin_unlock(&sctx->stat_lock);
+ }
}
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
-
+ __free_page(page);
return 0;
}
@@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
struct workqueue_struct *scrub_wr_comp =
fs_info->scrub_wr_completion_workers;
- struct workqueue_struct *scrub_parity =
- fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
if (scrub_workers)
destroy_workqueue(scrub_workers);
if (scrub_wr_comp)
destroy_workqueue(scrub_wr_comp);
- if (scrub_parity)
- destroy_workqueue(scrub_parity);
}
}
@@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
{
struct workqueue_struct *scrub_workers = NULL;
struct workqueue_struct *scrub_wr_comp = NULL;
- struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;
- scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
- if (!scrub_parity)
- goto fail_scrub_parity_workers;
-
mutex_lock(&fs_info->scrub_lock);
if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
ASSERT(fs_info->scrub_workers == NULL &&
- fs_info->scrub_wr_completion_workers == NULL &&
- fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_wr_completion_workers == NULL);
fs_info->scrub_workers = scrub_workers;
fs_info->scrub_wr_completion_workers = scrub_wr_comp;
- fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
mutex_unlock(&fs_info->scrub_lock);
return 0;
@@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->scrub_lock);
ret = 0;
- destroy_workqueue(scrub_parity);
-fail_scrub_parity_workers:
+
destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
destroy_workqueue(scrub_workers);
@@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
ret = scrub_enumerate_chunks(sctx, dev, start, end);
memalloc_nofs_restore(nofs_flag);
- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);
- wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
-
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
@@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-
-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
-{
- u64 mapped_length;
- struct btrfs_io_context *bioc = NULL;
- int ret;
-
- mapped_length = extent_len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bioc, 0);
- if (ret || !bioc || mapped_length < extent_len ||
- !bioc->stripes[0].dev->bdev) {
- btrfs_put_bioc(bioc);
- return;
- }
-
- *extent_physical = bioc->stripes[0].physical;
- *extent_mirror_num = bioc->mirror_num;
- *extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e5c963bb873d..af2e153543a5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
int left_ret;
int right_ret;
u64 left_gen;
- u64 right_gen;
+ u64 right_gen = 0;
struct btrfs_inode_info info;
ret = get_inode_info(sctx->send_root, ino, &info);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 3eecce86f63f..75e7fa337e66 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -537,7 +537,7 @@ again:
up_read(&info->groups_sem);
}
-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
u64 bytes;
@@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
return nr;
}
+static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
+ u64 to_reclaim)
+{
+ const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
+ u64 nr;
+
+ nr = div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
@@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_reclaim_items_nr(fs_info, num_bytes);
+ nr = calc_delayed_refs_nr(fs_info, num_bytes);
else
nr = 0;
btrfs_run_delayed_refs(trans, nr);
@@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
struct reserve_ticket ticket;
u64 start_ns = 0;
u64 used;
- int ret = 0;
+ int ret = -ENOSPC;
bool pending_tickets;
ASSERT(orig_bytes);
- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ /*
+ * If have a transaction handle (current->journal_info != NULL), then
+ * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
+ * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
+ * flushing methods can trigger transaction commits.
+ */
+ if (current->journal_info) {
+ /* One assert per line for easier debugging. */
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
+ }
if (flush == BTRFS_RESERVE_FLUSH_DATA)
async_work = &fs_info->async_data_reclaim_work;
@@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
async_work = &fs_info->async_reclaim_work;
spin_lock(&space_info->lock);
- ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
/*
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 2033b71b18ce..0bb9d14e60a8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum {
* - Running delayed refs
* - Running delalloc and waiting for ordered extents
* - Allocating a new chunk
+ * - Committing transaction
*/
BTRFS_RESERVE_FLUSH_EVICT,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 366fb4cde145..6cb97efee976 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1158,6 +1158,7 @@ static int btrfs_fill_super(struct super_block *sb,
inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, err, NULL);
goto fail_close;
}
@@ -2412,7 +2413,7 @@ static int __init btrfs_print_mod_info(void)
", fsverity=no"
#endif
;
- pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
+ pr_info("Btrfs loaded%s\n", options);
return 0;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 37fc58a7f27e..25294e624851 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
if (ret)
return ret;
+#ifdef CONFIG_BTRFS_DEBUG
+ if (thresh != 0 && (thresh > 100))
+ return -EINVAL;
+#else
if (thresh != 0 && (thresh <= 50 || thresh > 100))
return -EINVAL;
+#endif
WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index f2f2e11dac4c..ed0f36ae5346 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
em->map_lookup = map;
map->num_stripes = test->num_stripes;
- map->stripe_len = BTRFS_STRIPE_LEN;
map->type = test->raid_type;
for (i = 0; i < map->num_stripes; i++) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b8d5b1fa9a03..8b6a99b8d7f6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
/*
* We want to reserve all the bytes we may need all at once, so
* we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do 2 x num_items
- * worth of delayed refs updates in this trans handle, and
- * refill that amount for whatever is missing in the reserve.
+ * accomplish this by simply assuming we'll do num_items worth
+ * of delayed refs updates in this trans handle, and refill that
+ * amount for whatever is missing in the reserve.
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
- delayed_refs_bytes = num_bytes;
- num_bytes <<= 1;
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
+ num_items);
+ num_bytes += delayed_refs_bytes;
}
/*
@@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
wait_current_trans(fs_info);
}
-static bool should_end_transaction(struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- if (btrfs_check_space_for_delayed_refs(fs_info))
- return true;
-
- return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
-}
-
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
@@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
return true;
- return should_end_transaction(trans);
+ if (btrfs_check_space_for_delayed_refs(trans->fs_info))
+ return true;
+
+ return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50);
}
static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index baad1ed7e111..e2b54793bf0c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
stripe_len);
return -EUCLEAN;
}
+ /*
+ * We artificially limit the chunk size, so that the number of stripes
+ * inside a chunk can be fit into a U32. The current limit (256G) is
+ * way too large for real world usage anyway, and it's also much larger
+ * than our existing limit (10G).
+ *
+ * Thus it should be a good way to catch obvious bitflips.
+ */
+ if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) {
+ chunk_err(leaf, chunk, logical,
+ "chunk length too large: have %llu limit %llu",
+ length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT);
+ return -EUCLEAN;
+ }
if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
chunk_err(leaf, chunk, logical,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 200cea6e49e5..9b212e8c70cc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
btrfs_put_block_group(cache);
}
+static int clean_log_buffer(struct btrfs_trans_handle *trans,
+ struct extent_buffer *eb)
+{
+ int ret;
+
+ btrfs_tree_lock(eb);
+ btrfs_clear_buffer_dirty(trans, eb);
+ wait_on_extent_buffer_writeback(eb);
+ btrfs_tree_unlock(eb);
+
+ if (trans) {
+ ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+ if (ret)
+ return ret;
+ btrfs_redirty_list_add(trans->transaction, eb);
+ } else {
+ unaccount_log_buffer(eb->fs_info, eb->start);
+ }
+
+ return 0;
+}
+
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
@@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- u32 blocksize;
int ret = 0;
while (*level > 0) {
@@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
check.level = *level - 1;
check.has_first_key = true;
btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
- blocksize = fs_info->nodesize;
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
@@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_tree_lock(next);
- btrfs_clear_buffer_dirty(trans, next);
- wait_on_extent_buffer_writeback(next);
- btrfs_tree_unlock(next);
-
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans,
- bytenr, blocksize);
- if (ret) {
- free_extent_buffer(next);
- return ret;
- }
- btrfs_redirty_list_add(
- trans->transaction, next);
- } else {
- unaccount_log_buffer(fs_info, bytenr);
+ ret = clean_log_buffer(trans, next);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
}
}
free_extent_buffer(next);
@@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int i;
int slot;
int ret;
@@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
return ret;
if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[*level];
-
- btrfs_tree_lock(next);
- btrfs_clear_buffer_dirty(trans, next);
- wait_on_extent_buffer_writeback(next);
- btrfs_tree_unlock(next);
-
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
- if (ret)
- return ret;
- btrfs_redirty_list_add(trans->transaction,
- next);
- } else {
- unaccount_log_buffer(fs_info,
- path->nodes[*level]->start);
- }
+ ret = clean_log_buffer(trans, path->nodes[*level]);
+ if (ret)
+ return ret;
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
static int walk_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct walk_control *wc)
{
- struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
int wret;
int level;
@@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
orig_level);
if (ret)
goto out;
- if (wc->free) {
- struct extent_buffer *next;
-
- next = path->nodes[orig_level];
-
- btrfs_tree_lock(next);
- btrfs_clear_buffer_dirty(trans, next);
- wait_on_extent_buffer_writeback(next);
- btrfs_tree_unlock(next);
-
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans,
- next->start, next->len);
- if (ret)
- goto out;
- btrfs_redirty_list_add(trans->transaction, next);
- } else {
- unaccount_log_buffer(fs_info, next->start);
- }
- }
+ if (wc->free)
+ ret = clean_log_buffer(trans, path->nodes[orig_level]);
}
out:
@@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
ret = BTRFS_LOG_FORCE_COMMIT;
else
inode->last_dir_index_offset = last_index;
+
+ if (btrfs_get_first_dir_index_to_log(inode) == 0)
+ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
out:
kfree(ins_data);
@@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,
found_key.offset = 0;
found_key.type = 0;
- ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
+ ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
if (ret < 0)
break;
@@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
u64 ino = btrfs_ino(start_inode);
+ struct btrfs_inode *curr_inode = start_inode;
int ret = 0;
/*
@@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ /* Pairs with btrfs_add_delayed_iput below. */
+ ihold(&curr_inode->vfs_inode);
+
while (true) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
+ struct inode *vfs_inode;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ u64 next_index;
bool continue_curr_inode = true;
- int nritems;
- int i;
+ int iter_ret;
- min_key.objectid = ino;
- min_key.type = BTRFS_DIR_INDEX_KEY;
- min_key.offset = 0;
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
+ next_index = key.offset;
again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(root, &min_key, path, trans->transid);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- goto next;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
+ btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
struct inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != ino ||
- min_key.type != BTRFS_DIR_INDEX_KEY) {
+ if (found_key.objectid != ino ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
continue_curr_inode = false;
break;
}
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ next_index = found_key.offset + 1;
+
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
type = btrfs_dir_ftype(leaf, di);
if (btrfs_dir_transid(leaf, di) < trans->transid)
continue;
@@ -5496,12 +5466,24 @@ again:
break;
}
- if (continue_curr_inode && min_key.offset < (u64)-1) {
- min_key.offset++;
+ btrfs_release_path(path);
+
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
+ } else if (iter_ret > 0) {
+ continue_curr_inode = false;
+ } else {
+ key = found_key;
+ }
+
+ if (continue_curr_inode && key.offset < (u64)-1) {
+ key.offset++;
goto again;
}
-next:
+ btrfs_set_first_dir_index_to_log(curr_inode, next_index);
+
if (list_empty(&dir_list))
break;
@@ -5509,9 +5491,22 @@ next:
ino = dir_elem->ino;
list_del(&dir_elem->list);
kfree(dir_elem);
+
+ btrfs_add_delayed_iput(curr_inode);
+ curr_inode = NULL;
+
+ vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ if (IS_ERR(vfs_inode)) {
+ ret = PTR_ERR(vfs_inode);
+ break;
+ }
+ curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
+ if (curr_inode)
+ btrfs_add_delayed_iput(curr_inode);
+
if (ret) {
struct btrfs_dir_list *next;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6d592870400..03f52e4a20aa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
- extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->last_flush_error = 0;
/* Verify the device is back in a pristine state */
- ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
- ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
- ASSERT(list_empty(&device->dev_alloc_list));
- ASSERT(list_empty(&device->post_commit_list));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ WARN_ON(!list_empty(&device->dev_alloc_list));
+ WARN_ON(!list_empty(&device->post_commit_list));
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_fs_devices *seed_devices;
+ struct btrfs_fs_devices *seed_devices = NULL;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
int ret = 0;
@@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular(
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
ctl->max_chunk_size);
- ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+ ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT;
}
static void init_alloc_chunk_ctl_policy_zoned(
@@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
j * ctl->stripe_size;
}
}
- map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
@@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
}
write_unlock(&em_tree->lock);
- block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+ block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
if (IS_ERR(block_group))
goto error_del_extent;
@@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_chunk_length(chunk, bg->length);
btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
- btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
- btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
- btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
+ btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
+ btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
@@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
ret = map->num_stripes;
free_extent_map(em);
-
- down_read(&fs_info->dev_replace.rwsem);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
- fs_info->dev_replace.tgtdev)
- ret++;
- up_read(&fs_info->dev_replace.rwsem);
-
return ret;
}
@@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- len = map->stripe_len * nr_data_stripes(map);
+ len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
free_extent_map(em);
}
return len;
@@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
-{
- int i;
- int again = 1;
-
- while (again) {
- again = 0;
- for (i = 0; i < num_stripes - 1; i++) {
- /* Swap if parity is on a smaller index */
- if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
- swap(bioc->stripes[i], bioc->stripes[i + 1]);
- swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
- again = 1;
- }
- }
- }
-}
-
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- int total_stripes,
- int real_stripes)
+ u16 total_stripes)
{
- struct btrfs_io_context *bioc = kzalloc(
+ struct btrfs_io_context *bioc;
+
+ bioc = kzalloc(
/* The size of btrfs_io_context */
sizeof(struct btrfs_io_context) +
/* Plus the variable array for the stripes */
- sizeof(struct btrfs_io_stripe) * (total_stripes) +
- /* Plus the variable array for the tgt dev */
- sizeof(int) * (real_stripes) +
- /*
- * Plus the raid_map, which includes both the tgt dev
- * and the stripes.
- */
- sizeof(u64) * (total_stripes),
+ sizeof(struct btrfs_io_stripe) * (total_stripes),
GFP_NOFS);
if (!bioc)
@@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
- bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
- bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+ bioc->replace_stripe_src = -1;
+ bioc->full_stripe_logical = (u64)-1;
return bioc;
}
@@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
- u64 stripe_nr;
- u64 stripe_nr_end;
+ u32 stripe_nr;
+ u32 stripe_nr_end;
+ u32 stripe_cnt;
u64 stripe_end_offset;
- u64 stripe_cnt;
- u64 stripe_len;
u64 stripe_offset;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
- u64 stripes_per_dev = 0;
+ u32 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
int ret;
@@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out_free_map;
-}
+ }
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
*length_ret = length;
- stripe_len = map->stripe_len;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- stripe_nr = div64_u64(offset, stripe_len);
+ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
/* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_nr * stripe_len;
+ stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
- stripe_nr_end = round_up(offset + length, map->stripe_len);
- stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
+ stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
+ BTRFS_STRIPE_LEN_SHIFT;
stripe_cnt = stripe_nr_end - stripe_nr;
- stripe_end_offset = stripe_nr_end * map->stripe_len -
+ stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) -
(offset + length);
/*
* after this, stripe_nr is the number of stripes on this
@@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
factor = map->num_stripes / sub_stripes;
*num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ stripe_index = stripe_nr % factor;
+ stripe_nr /= factor;
stripe_index *= sub_stripes;
- stripes_per_dev = div_u64_rem(stripe_cnt, factor,
- &remaining_stripes);
- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
- last_stripe *= sub_stripes;
+
+ remaining_stripes = stripe_cnt % factor;
+ stripes_per_dev = stripe_cnt / factor;
+ last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
*num_stripes = map->num_stripes;
} else {
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
+ stripe_index = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
}
stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
@@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
for (i = 0; i < *num_stripes; i++) {
stripes[i].physical =
map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
+ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- stripes[i].length = stripes_per_dev * map->stripe_len;
+ stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT;
if (i / sub_stripes < remaining_stripes)
- stripes[i].length += map->stripe_len;
+ stripes[i].length += BTRFS_STRIPE_LEN;
/*
* Special for the first stripe and
@@ -6103,83 +6068,6 @@ out_free_map:
return ERR_PTR(ret);
}
-/*
- * In dev-replace case, for repair case (that's the only case where the mirror
- * is selected explicitly when calling btrfs_map_block), blocks left of the
- * left cursor can also be read from the target drive.
- *
- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
- * array of stripes.
- * For READ, it also needs to be supported using the same mirror number.
- *
- * If the requested block is not left of the left cursor, EIO is returned. This
- * can happen because btrfs_num_copies() returns one more in the dev-replace
- * case.
- */
-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length,
- u64 srcdev_devid, int *mirror_num,
- u64 *physical)
-{
- struct btrfs_io_context *bioc = NULL;
- int num_stripes;
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
- int i;
- int ret = 0;
-
- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bioc, NULL, NULL, 0);
- if (ret) {
- ASSERT(bioc == NULL);
- return ret;
- }
-
- num_stripes = bioc->num_stripes;
- if (*mirror_num > num_stripes) {
- /*
- * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
- * that means that the requested area is not left of the left
- * cursor
- */
- btrfs_put_bioc(bioc);
- return -EIO;
- }
-
- /*
- * process the rest of the function using the mirror_num of the source
- * drive. Therefore look it up first. At the end, patch the device
- * pointer to the one of the target drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid != srcdev_devid)
- continue;
-
- /*
- * In case of DUP, in order to keep it simple, only add the
- * mirror with the lowest physical address
- */
- if (found &&
- physical_of_found <= bioc->stripes[i].physical)
- continue;
-
- index_srcdev = i;
- found = 1;
- physical_of_found = bioc->stripes[i].physical;
- }
-
- btrfs_put_bioc(bioc);
-
- ASSERT(found);
- if (!found)
- return -EIO;
-
- *mirror_num = index_srcdev + 1;
- *physical = physical_of_found;
- return ret;
-}
-
static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
struct btrfs_block_group *cache;
@@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_io_context **bioc_ret,
+ struct btrfs_io_context *bioc,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
- int tgtdev_indexes = 0;
+ /*
+ * At this stage, num_stripes is still the real number of stripes,
+ * excluding the duplicated stripes.
+ */
int num_stripes = *num_stripes_ret;
+ int nr_extra_stripes = 0;
int max_errors = *max_errors_ret;
int i;
- if (op == BTRFS_MAP_WRITE) {
- int index_where_to_add;
+ /*
+ * A block group which has "to_copy" set will eventually be copied by
+ * the dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
- /*
- * A block group which have "to_copy" set will eventually
- * copied by dev-replace process. We can avoid cloning IO here.
- */
- if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
- return;
+ /*
+ * Duplicate the write operations while the dev-replace procedure is
+ * running. Since the copying of the old disk to the new disk takes
+ * place at run time while the filesystem is mounted writable, the
+ * regular write operations to the old disk have to be duplicated to go
+ * to the new disk as well.
+ *
+ * Note that device->missing is handled by the caller, and that the
+ * write to the old disk is already set up in the stripes array.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_io_stripe *old = &bioc->stripes[i];
+ struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
- /*
- * duplicate the write operations while the dev replace
- * procedure is running. Since the copying of the old disk to
- * the new disk takes place at run time while the filesystem is
- * mounted writable, the regular write operations to the old
- * disk have to be duplicated to go to the new disk as well.
- *
- * Note that device->missing is handled by the caller, and that
- * the write to the old disk is already set up in the stripes
- * array.
- */
- index_where_to_add = num_stripes;
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /* write to new disk, too */
- struct btrfs_io_stripe *new =
- bioc->stripes + index_where_to_add;
- struct btrfs_io_stripe *old =
- bioc->stripes + i;
-
- new->physical = old->physical;
- new->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[i] = index_where_to_add;
- index_where_to_add++;
- max_errors++;
- tgtdev_indexes++;
- }
- }
- num_stripes = index_where_to_add;
- } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
- int index_srcdev = 0;
- int found = 0;
- u64 physical_of_found = 0;
+ if (old->dev->devid != srcdev_devid)
+ continue;
- /*
- * During the dev-replace procedure, the target drive can also
- * be used to read data in case it is needed to repair a corrupt
- * block elsewhere. This is possible if the requested area is
- * left of the left cursor. In this area, the target drive is a
- * full copy of the source drive.
- */
- for (i = 0; i < num_stripes; i++) {
- if (bioc->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it simple,
- * only add the mirror with the lowest physical
- * address
- */
- if (found &&
- physical_of_found <= bioc->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found = bioc->stripes[i].physical;
- }
- }
- if (found) {
- struct btrfs_io_stripe *tgtdev_stripe =
- bioc->stripes + num_stripes;
+ new->physical = old->physical;
+ new->dev = dev_replace->tgtdev;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ bioc->replace_stripe_src = i;
+ nr_extra_stripes++;
+ }
+
+ /* We can only have at most 2 extra nr_stripes (for DUP). */
+ ASSERT(nr_extra_stripes <= 2);
+ /*
+ * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
+ * replace.
+ * If we have 2 extra stripes, only choose the one with smaller physical.
+ */
+ if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
+ struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
- tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->dev = dev_replace->tgtdev;
- bioc->tgtdev_map[index_srcdev] = num_stripes;
+ /* Only DUP can have two extra stripes. */
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
- tgtdev_indexes++;
- num_stripes++;
+ /*
+ * Swap the last stripe stripes and reduce @nr_extra_stripes.
+ * The extra stripe would still be there, but won't be accessed.
+ */
+ if (first->physical > second->physical) {
+ swap(second->physical, first->physical);
+ swap(second->dev, first->dev);
+ nr_extra_stripes--;
}
}
- *num_stripes_ret = num_stripes;
- *max_errors_ret = max_errors;
- bioc->num_tgtdevs = tgtdev_indexes;
- *bioc_ret = bioc;
+ *num_stripes_ret = num_stripes + nr_extra_stripes;
+ *max_errors_ret = max_errors + nr_extra_stripes;
+ bioc->replace_nr_stripes = nr_extra_stripes;
}
static bool need_full_stripe(enum btrfs_map_op op)
@@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op)
}
static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u64 *stripe_nr, u64 *stripe_offset,
+ u64 offset, u32 *stripe_nr, u64 *stripe_offset,
u64 *full_stripe_start)
{
- u32 stripe_len = map->stripe_len;
-
ASSERT(op != BTRFS_MAP_DISCARD);
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset);
+ *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
ASSERT(*stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+ unsigned long full_stripe_len = nr_data_stripes(map) <<
+ BTRFS_STRIPE_LEN_SHIFT;
+ /*
+ * For full stripe start, we use previously calculated
+ * @stripe_nr. Align it to nr_data_stripes, then multiply with
+ * STRIPE_LEN.
+ *
+ * By this we can avoid u64 division completely. And we have
+ * to go rounddown(), not round_down(), as nr_data_stripes is
+ * not ensured to be power of 2.
+ */
*full_stripe_start =
- div64_u64(offset, full_stripe_len) * full_stripe_len;
+ rounddown(*stripe_nr, nr_data_stripes(map)) <<
+ BTRFS_STRIPE_LEN_SHIFT;
/*
* For writes to RAID56, allow to write a full stripe set, but
@@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return stripe_len - *stripe_offset;
+ return BTRFS_STRIPE_LEN - *stripe_offset;
return U64_MAX;
}
static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+ u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
{
dst->dev = map->stripes[stripe_index].dev;
dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
+ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT);
}
int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
@@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct map_lookup *map;
u64 map_offset;
u64 stripe_offset;
- u64 stripe_nr;
- u64 stripe_len;
+ u32 stripe_nr;
u32 stripe_index;
int data_stripes;
int i;
int ret = 0;
int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
int num_stripes;
+ int num_copies;
int max_errors = 0;
- int tgtdev_indexes = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
- int num_alloc_stripes;
- int patch_the_first_stripe_for_dev_replace = 0;
- u64 physical_to_patch_in_first_stripe = 0;
+ u16 num_alloc_stripes;
u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
+ num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
+ if (mirror_num > num_copies)
+ return -EINVAL;
+
em = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(em))
return PTR_ERR(em);
map = em->map_lookup;
data_stripes = nr_data_stripes(map);
- stripe_len = map->stripe_len;
map_offset = logical - em->start;
max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
@@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
- if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
- !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
- ret = get_extra_mirror_from_replace(fs_info, logical, *length,
- dev_replace->srcdev->devid,
- &mirror_num,
- &physical_to_patch_in_first_stripe);
- if (ret)
- goto out;
- else
- patch_the_first_stripe_for_dev_replace = 1;
- } else if (mirror_num > map->num_stripes) {
- mirror_num = 0;
- }
-
num_stripes = 1;
stripe_index = 0;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
+ stripe_index = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
@@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u32 factor = map->num_stripes / map->sub_stripes;
- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
- stripe_index *= map->sub_stripes;
+ stripe_index = (stripe_nr % factor) * map->sub_stripes;
+ stripe_nr /= factor;
if (need_full_stripe(op))
num_stripes = map->sub_stripes;
@@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
- /* push stripe_nr back to the start of the full stripe */
- stripe_nr = div64_u64(raid56_full_stripe_start,
- stripe_len * data_stripes);
+ /*
+ * Push stripe_nr back to the start of the full stripe
+ * For those cases needing a full stripe, @stripe_nr
+ * is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len,
+ * but that can be expensive. Here we just divide
+ * @stripe_nr with @data_stripes.
+ */
+ stripe_nr /= data_stripes;
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
/* Return the length to the full stripe end */
*length = min(logical + *length,
raid56_full_stripe_start + em->start +
- data_stripes * stripe_len) - logical;
+ (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
- stripe_nr = div_u64_rem(stripe_nr,
- data_stripes, &stripe_index);
+ stripe_index = stripe_nr % data_stripes;
+ stripe_nr /= data_stripes;
if (mirror_num > 1)
stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
- div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
- &stripe_index);
+ stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
if (!need_full_stripe(op) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
- * after this, stripe_nr is the number of stripes on this
+ * After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
- &stripe_index);
+ stripe_index = stripe_nr % map->num_stripes;
+ stripe_nr /= map->num_stripes;
mirror_num = stripe_index + 1;
}
if (stripe_index >= map->num_stripes) {
@@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
num_alloc_stripes = num_stripes;
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
- if (op == BTRFS_MAP_WRITE)
- num_alloc_stripes <<= 1;
- if (op == BTRFS_MAP_GET_READ_MIRRORS)
- num_alloc_stripes++;
- tgtdev_indexes = num_stripes;
- }
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
+ op != BTRFS_MAP_READ)
+ /*
+ * For replace case, we need to add extra stripes for extra
+ * duplicated stripes.
+ *
+ * For both WRITE and GET_READ_MIRRORS, we may have at most
+ * 2 more stripes (DUP types, otherwise 1).
+ */
+ num_alloc_stripes += 2;
/*
* If this I/O maps to a single device, try to return the device and
@@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
(!need_full_stripe(op) || !dev_replace_is_ongoing ||
!dev_replace->tgtdev)) {
- if (patch_the_first_stripe_for_dev_replace) {
- smap->dev = dev_replace->tgtdev;
- smap->physical = physical_to_patch_in_first_stripe;
- *mirror_num_ret = map->num_stripes + 1;
- } else {
- set_io_stripe(smap, map, stripe_index, stripe_offset,
- stripe_nr);
- *mirror_num_ret = mirror_num;
- }
+ set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ *mirror_num_ret = mirror_num;
*bioc_ret = NULL;
ret = 0;
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
}
+ bioc->map_type = map->type;
- for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
- stripe_nr);
- stripe_index++;
- }
-
- /* Build raid_map */
+ /*
+ * For RAID56 full map, we need to make sure the stripes[] follows the
+ * rule that data stripes are all ordered, then followed with P and Q
+ * (if we have).
+ *
+ * It's still mostly the same as other profiles, just with extra rotation.
+ */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
- u64 tmp;
- unsigned rot;
-
- /* Work out the disk rotation on this stripe-set */
- div_u64_rem(stripe_nr, num_stripes, &rot);
-
- /* Fill in the logical address of each stripe */
- tmp = stripe_nr * data_stripes;
- for (i = 0; i < data_stripes; i++)
- bioc->raid_map[(i + rot) % num_stripes] =
- em->start + (tmp + i) * map->stripe_len;
-
- bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
- if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bioc->raid_map[(i + rot + 1) % num_stripes] =
- RAID6_Q_STRIPE;
-
- sort_parity_stripes(bioc, num_stripes);
+ /*
+ * For RAID56 @stripe_nr is already the number of full stripes
+ * before us, which is also the rotation value (needs to modulo
+ * with num_stripes).
+ *
+ * In this case, we just add @stripe_nr with @i, then do the
+ * modulo, to reduce one modulo call.
+ */
+ bioc->full_stripe_logical = em->start +
+ ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT);
+ for (i = 0; i < num_stripes; i++)
+ set_io_stripe(&bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
+ } else {
+ /*
+ * For all other non-RAID56 profiles, just copy the target
+ * stripe into the bioc.
+ */
+ for (i = 0; i < num_stripes; i++) {
+ set_io_stripe(&bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
+ stripe_index++;
+ }
}
if (need_full_stripe(op))
@@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
+ handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
*bioc_ret = bioc;
- bioc->map_type = map->type;
bioc->num_stripes = num_stripes;
bioc->max_errors = max_errors;
bioc->mirror_num = mirror_num;
- /*
- * this is the case that REQ_READ && dev_replace_is_ongoing &&
- * mirror_num == num_stripes + 1 && dev_replace target drive is
- * available as a mirror
- */
- if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
- WARN_ON(num_stripes > 1);
- bioc->stripes[0].dev = dev_replace->tgtdev;
- bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
- bioc->mirror_num = map->num_stripes + 1;
- }
out:
if (dev_replace_is_ongoing) {
lockdep_assert_held(&dev_replace->rwsem);
@@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
- map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = type;
/*
* We can't use the sub_stripes value, as for profiles other than
@@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
return true;
}
+
+static void map_raid56_repair_block(struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap,
+ u64 logical)
+{
+ int data_stripes = nr_bioc_data_stripes(bioc);
+ int i;
+
+ for (i = 0; i < data_stripes; i++) {
+ u64 stripe_start = bioc->full_stripe_logical +
+ (i << BTRFS_STRIPE_LEN_SHIFT);
+
+ if (logical >= stripe_start &&
+ logical < stripe_start + BTRFS_STRIPE_LEN)
+ break;
+ }
+ ASSERT(i < data_stripes);
+ smap->dev = bioc->stripes[i].dev;
+ smap->physical = bioc->stripes[i].physical +
+ ((logical - bioc->full_stripe_logical) &
+ BTRFS_STRIPE_LEN_MASK);
+}
+
+/*
+ * Map a repair write into a single device.
+ *
+ * A repair write is triggered by read time repair or scrub, which would only
+ * update the contents of a single device.
+ * Not update any other mirrors nor go through RMW path.
+ *
+ * Callers should ensure:
+ *
+ * - Call btrfs_bio_counter_inc_blocked() first
+ * - The range does not cross stripe boundary
+ * - Has a valid @mirror_num passed in.
+ */
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+ struct btrfs_io_stripe *smap, u64 logical,
+ u32 length, int mirror_num)
+{
+ struct btrfs_io_context *bioc = NULL;
+ u64 map_length = length;
+ int mirror_ret = mirror_num;
+ int ret;
+
+ ASSERT(mirror_num > 0);
+
+ ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
+ &bioc, smap, &mirror_ret, true);
+ if (ret < 0)
+ return ret;
+
+ /* The map range should not cross stripe boundary. */
+ ASSERT(map_length >= length);
+
+ /* Already mapped to single stripe. */
+ if (!bioc)
+ goto out;
+
+ /* Map the RAID56 multi-stripe writes to a single one. */
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ map_raid56_repair_block(bioc, smap, logical);
+ goto out;
+ }
+
+ ASSERT(mirror_num <= bioc->num_stripes);
+ smap->dev = bioc->stripes[mirror_num - 1].dev;
+ smap->physical = bioc->stripes[mirror_num - 1].physical;
+out:
+ btrfs_put_bioc(bioc);
+ ASSERT(smap->dev);
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7e51f2238f72..bf47a1a70813 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,7 +17,11 @@
extern struct mutex uuid_mutex;
-#define BTRFS_STRIPE_LEN SZ_64K
+#define BTRFS_STRIPE_LEN SZ_64K
+#define BTRFS_STRIPE_LEN_SHIFT (16)
+#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1)
+
+static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
/* Used by sanity check for btrfs_raid_types. */
#define const_ffs(n) (__builtin_ctzll(n) + 1)
@@ -404,17 +408,74 @@ struct btrfs_io_context {
u64 map_type; /* get from map_lookup->type */
struct bio *orig_bio;
atomic_t error;
- int max_errors;
- int num_stripes;
- int mirror_num;
- int num_tgtdevs;
- int *tgtdev_map;
+ u16 max_errors;
+
+ /*
+ * The total number of stripes, including the extra duplicated
+ * stripe for replace.
+ */
+ u16 num_stripes;
+
+ /*
+ * The mirror_num of this bioc.
+ *
+ * This is for reads which use 0 as mirror_num, thus we should return a
+ * valid mirror_num (>0) for the reader.
+ */
+ u16 mirror_num;
+
+ /*
+ * The following two members are for dev-replace case only.
+ *
+ * @replace_nr_stripes: Number of duplicated stripes which need to be
+ * written to replace target.
+ * Should be <= 2 (2 for DUP, otherwise <= 1).
+ * @replace_stripe_src: The array indicates where the duplicated stripes
+ * are from.
+ *
+ * The @replace_stripe_src[] array is mostly for RAID56 cases.
+ * As non-RAID56 stripes share the same contents of the mapped range,
+ * thus no need to bother where the duplicated ones are from.
+ *
+ * But for RAID56 case, all stripes contain different contents, thus
+ * we need a way to know the mapping.
+ *
+ * There is an example for the two members, using a RAID5 write:
+ *
+ * num_stripes: 4 (3 + 1 duplicated write)
+ * stripes[0]: dev = devid 1, physical = X
+ * stripes[1]: dev = devid 2, physical = Y
+ * stripes[2]: dev = devid 3, physical = Z
+ * stripes[3]: dev = devid 0, physical = Y
+ *
+ * replace_nr_stripes = 1
+ * replace_stripe_src = 1 <- Means stripes[1] is involved in replace.
+ * The duplicated stripe index would be
+ * (@num_stripes - 1).
+ *
+ * Note, that we can still have cases replace_nr_stripes = 2 for DUP.
+ * In that case, all stripes share the same content, thus we don't
+ * need to bother @replace_stripe_src value at all.
+ */
+ u16 replace_nr_stripes;
+ s16 replace_stripe_src;
/*
- * logical block numbers for the start of each stripe
- * The last one or two are p/q. These are sorted,
- * so raid_map[0] is the start of our full stripe
+ * Logical bytenr of the full stripe start, only for RAID56 cases.
+ *
+ * When this value is set to other than (u64)-1, the stripes[] should
+ * follow this pattern:
+ *
+ * (real_stripes = num_stripes - replace_nr_stripes)
+ * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1))
+ *
+ * stripes[0]: The first data stripe
+ * stripes[1]: The second data stripe
+ * ...
+ * stripes[data_stripes - 1]: The last data stripe
+ * stripes[data_stripes]: The P stripe
+ * stripes[data_stripes + 1]: The Q stripe (only for RAID6).
*/
- u64 *raid_map;
+ u64 full_stripe_logical;
struct btrfs_io_stripe stripes[];
};
@@ -446,7 +507,6 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
- u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
@@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret,
int need_raid_map);
+int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
+ struct btrfs_io_stripe *smap, u64 logical,
+ u32 length, int mirror_num);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index da7bb9187b68..8acb05e176c5 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -350,8 +350,6 @@ done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
kunmap_local(data_in);
- if (!ret)
- zero_fill_bio(cb->orig_bio);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 45d04092f2f8..a9b32ba6b2ce 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
struct btrfs_inode *inode = bbio->inode;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_fs_info *fs_info = bbio->fs_info;
struct btrfs_block_group *cache;
bool ret = false;
if (!btrfs_is_zoned(fs_info))
return false;
- if (!is_data_inode(&inode->vfs_inode))
+ if (!inode || !is_data_inode(&inode->vfs_inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e34f1ab99d56..f798da267590 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
}
ret = 0;
- zero_fill_bio(cb->orig_bio);
done:
if (workspace->in_buf.src)
kunmap_local(workspace->in_buf.src);