diff options
author | Kent Overstreet <kent.overstreet@linux.dev> | 2023-09-10 18:05:17 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:10:12 -0400 |
commit | 1809b8cba756d32bd6e976ed4ee64efdf66c6d94 (patch) | |
tree | f25287cf7f337aa8b8a093a91e0478a909833b8c /fs/bcachefs | |
parent | cbf57db53f311b09de2c17b514e104d421d72871 (diff) | |
download | lwn-1809b8cba756d32bd6e976ed4ee64efdf66c6d94.tar.gz lwn-1809b8cba756d32bd6e976ed4ee64efdf66c6d94.zip |
bcachefs: Break up io.c
More reorganization, this splits up io.c into
- io_read.c
- io_misc.c - fallocate, fpunch, truncate
- io_write.c
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
34 files changed, 1751 insertions, 1692 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0a4d2fed66c1..9c00dabb26ac 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -46,7 +46,9 @@ bcachefs-y := \ fs-io-pagecache.o \ fsck.o \ inode.o \ - io.o \ + io_read.o \ + io_misc.o \ + io_write.o \ journal.o \ journal_io.o \ journal_reclaim.o \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index e02749ddc362..8e1888a89011 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -25,7 +25,7 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "movinggc.h" #include "nocow_locking.h" diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 0edbb73a5ec8..00f53cb5d44b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -14,7 +14,7 @@ #include "debug.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "recovery.h" diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index cd99bbb00a5a..7e03dd76fb38 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -7,7 +7,7 @@ #include "btree_locking.h" #include "checksum.h" #include "extents.h" -#include "io_types.h" +#include "io_write_types.h" struct bch_fs; struct btree_write; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 6b17f7cc5860..f1651807c2b7 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -3,7 +3,6 @@ #include "checksum.h" #include "compress.h" #include "extents.h" -#include "io.h" #include "super-io.h" #include <linux/lz4.h> diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 81518f20d37d..29576c4c109d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -9,7 +9,7 @@ #include "ec.h" #include "error.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "keylist.h" #include "move.h" #include "nocow_locking.h" diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 49e9055cbb52..7ca1f98d7e94 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,7 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" -#include "io_types.h" +#include "io_write_types.h" struct moving_context; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ae47e1854b80..5f3e65f9069e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -19,7 +19,6 @@ #include "extents.h" #include "fsck.h" #include "inode.h" -#include "io.h" #include "super.h" #include <linux/console.h> diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 67a5453a36d9..40e72b96745a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -11,10 +11,11 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "checksum.h" #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" +#include "io_read.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index dc906fc9176f..8d58f2cca260 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -61,3 +61,10 @@ int __bch2_err_class(int err) return -err; } + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index f7fa87442e98..379d9d7ed333 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -249,4 +249,8 @@ static inline long bch2_err_class(long err) return err < 0 ? __bch2_err_class(err) : err; } +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 39009cf0c448..2a5af8872613 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "io.h" #include "super.h" #define FSCK_ERR_RATELIMIT_NR 10 diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index dc22182d532f..2034d635c718 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -8,7 +8,8 @@ #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include <linux/backing-dev.h> #include <linux/pagemap.h> diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2b29abd24d56..219bc1124477 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -7,7 +7,8 @@ #include "fs-io.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include <linux/kthread.h> #include <linux/pagemap.h> diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ceab12fb8a8f..0b0b3b0d6c7d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -16,7 +17,7 @@ #include "fsck.h" #include "inode.h" #include "journal.h" -#include "io.h" +#include "io_misc.h" #include "keylist.h" #include "quota.h" #include "reflink.h" diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index bb5b709fa8cf..bc6e8439d40b 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -6,7 +6,7 @@ #include "buckets.h" #include "fs.h" -#include "io_types.h" +#include "io_write_types.h" #include "quota.h" #include <linux/uio.h> diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 08f810992a1b..0648874d54f3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -19,7 +19,7 @@ #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" #include "journal.h" #include "keylist.h" #include "quota.h" diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h deleted file mode 100644 index 831e3f1b7e41..000000000000 --- a/fs/bcachefs/io.h +++ /dev/null @@ -1,202 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_H -#define _BCACHEFS_IO_H - -#include "checksum.h" -#include "bkey_buf.h" -#include "io_types.h" - -#define to_wbio(_bio) \ - container_of((_bio), struct bch_write_bio, bio) - -#define to_rbio(_bio) \ - container_of((_bio), struct bch_read_bio, bio) - -void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - -void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *, bool); - -#define BLK_STS_REMOVED ((__force blk_status_t)128) - -const char *bch2_blk_status_to_str(blk_status_t); - -#define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(DONE) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - -static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->watermark == BCH_WATERMARK_copygc - ? op->c->copygc_wq - : op->c->btree_update_wq; -} - -int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, subvol_inum, - struct btree_iter *, struct bkey_i *, - struct disk_reservation *, u64, s64 *, bool); -int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - unsigned, struct bch_io_opts, s64 *, - struct write_point_specifier); - -int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); - -static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct bch_io_opts opts) -{ - op->c = c; - op->end_io = NULL; - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_opt = opts.compression; - op->nr_replicas = 0; - op->nr_replicas_required = c->opts.data_replicas_required; - op->watermark = BCH_WATERMARK_normal; - op->incompressible = 0; - op->open_buckets.nr = 0; - op->devs_have.nr = 0; - op->target = 0; - op->opts = opts; - op->subvol = 0; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->write_point = (struct write_point_specifier) { 0 }; - op->res = (struct disk_reservation) { 0 }; - op->new_i_size = U64_MAX; - op->i_sectors_delta = 0; - op->devs_need_flush = NULL; -} - -void bch2_write(struct closure *); - -void bch2_write_point_do_index_updates(struct work_struct *); - -static inline struct bch_write_bio *wbio_init(struct bio *bio) -{ - struct bch_write_bio *wbio = to_wbio(bio); - - memset(&wbio->wbio, 0, sizeof(wbio->wbio)); - return wbio; -} - -void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); - -struct bch_devs_mask; -struct cache_promote_op; -struct extent_ptr_decoded; - -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - -static inline int bch2_read_indirect_extent(struct btree_trans *trans, - enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) -{ - if (k->k->k.type != KEY_TYPE_reflink_p) - return 0; - - *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); -} - -enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, -}; - -int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bpos, enum btree_id, - struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); - -static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, unsigned flags) -{ - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); -} - -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum) -{ - struct bch_io_failures failed = { .nr = 0 }; - - BUG_ON(rbio->_state); - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); -} - -static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) -{ - struct bch_read_bio *rbio = to_rbio(bio); - - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; - return rbio; -} - -void bch2_fs_io_exit(struct bch_fs *); -int bch2_fs_io_init(struct bch_fs *); - -#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c new file mode 100644 index 000000000000..c04e5dacfc8d --- /dev/null +++ b/fs/bcachefs/io_misc.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "extents.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h new file mode 100644 index 000000000000..46e9ce3251d6 --- /dev/null +++ b/fs/bcachefs/io_misc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_MISC_H +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + unsigned, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c new file mode 100644 index 000000000000..cd62bf730396 --- /dev/null +++ b/fs/bcachefs/io_read.c @@ -0,0 +1,1207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "data_update.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" +#include "trace.h" + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (!opts.promote_target) + return false; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return false; + + if (bkey_extent_is_unwritten(k)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ + return false; + } + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. + */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + offset_into_extent = 0; + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(&trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_read_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_read_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h new file mode 100644 index 000000000000..d9c18bb7d403 --- /dev/null +++ b/fs/bcachefs/io_read.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_READ_H +#define _BCACHEFS_IO_READ_H + +#include "bkey_buf.h" + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_read_exit(struct bch_fs *); +int bch2_fs_io_read_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io_write.c index 3c614c864b6e..7f29fd2f05b1 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io_write.c @@ -1,29 +1,24 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Some low level IO code, and hacks for various block layer limitations - * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" -#include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" -#include "compress.h" #include "clock.h" -#include "data_update.h" +#include "compress.h" #include "debug.h" -#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extent_update.h" #include "inode.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "move.h" @@ -39,48 +34,8 @@ #include <linux/random.h> #include <linux/sched/mm.h> -const char *bch2_blk_status_to_str(blk_status_t status) -{ - if (status == BLK_STS_REMOVED) - return "device removed"; - return blk_status_to_str(status); -} - #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - const struct bch_devs_mask *devs; - unsigned d, nr = 0, total = 0; - u64 now = local_clock(), last; - s64 congested; - struct bch_dev *ca; - - if (!target) - return false; - - rcu_read_lock(); - devs = bch2_target_to_mask(c, target) ?: - &c->rw_devs[BCH_DATA_user]; - - for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { - ca = rcu_dereference(c->devs[d]); - if (!ca) - continue; - - congested = atomic_read(&ca->congested); - last = READ_ONCE(ca->congested_last); - if (time_after64(now, last)) - congested -= (now - last) >> 12; - - total += max(congested, 0LL); - nr++; - } - rcu_read_unlock(); - - return bch2_rand_range(nr * CONGESTED_MAX) < total; -} - static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, u64 now, int rw) { @@ -136,13 +91,6 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } -#else - -static bool bch2_target_congested(struct bch_fs *c, u16 target) -{ - return false; -} - #endif /* Allocate, free from mempool: */ @@ -368,206 +316,6 @@ int bch2_extent_update(struct btree_trans *trans, return 0; } -/* Overwrites whatever was present with zeroes: */ -int bch2_extent_fallocate(struct btree_trans *trans, - subvol_inum inum, - struct btree_iter *iter, - unsigned sectors, - struct bch_io_opts opts, - s64 *i_sectors_delta, - struct write_point_specifier write_point) -{ - struct bch_fs *c = trans->c; - struct disk_reservation disk_res = { 0 }; - struct closure cl; - struct open_buckets open_buckets = { 0 }; - struct bkey_s_c k; - struct bkey_buf old, new; - unsigned sectors_allocated = 0; - bool have_reservation = false; - bool unwritten = opts.nocow && - c->sb.version >= bcachefs_metadata_version_unwritten_extents; - int ret; - - bch2_bkey_buf_init(&old); - bch2_bkey_buf_init(&new); - closure_init_stack(&cl); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); - - if (!have_reservation) { - unsigned new_replicas = - max(0, (int) opts.data_replicas - - (int) bch2_bkey_nr_ptrs_fully_allocated(k)); - /* - * Get a disk reservation before (in the nocow case) calling - * into the allocator: - */ - ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); - if (unlikely(ret)) - goto err; - - bch2_bkey_buf_reassemble(&old, c, k); - } - - if (have_reservation) { - if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) - goto err; - - bch2_key_resize(&new.k->k, sectors); - } else if (!unwritten) { - struct bkey_i_reservation *reservation; - - bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); - reservation = bkey_reservation_init(new.k); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; - } else { - struct bkey_i_extent *e; - struct bch_devs_list devs_have; - struct write_point *wp; - struct bch_extent_ptr *ptr; - - devs_have.nr = 0; - - bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); - - e = bkey_extent_init(new.k); - e->k.p = iter->pos; - - ret = bch2_alloc_sectors_start_trans(trans, - opts.foreground_target, - false, - write_point, - &devs_have, - opts.data_replicas, - opts.data_replicas, - BCH_WATERMARK_normal, 0, &cl, &wp); - if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; - if (ret) - goto err; - - sectors = min(sectors, wp->sectors_free); - sectors_allocated = sectors; - - bch2_key_resize(&e->k, sectors); - - bch2_open_bucket_get(c, wp, &open_buckets); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); - bch2_alloc_sectors_done(c, wp); - - extent_for_each_ptr(extent_i_to_s(e), ptr) - ptr->unwritten = true; - } - - have_reservation = true; - - ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, - 0, i_sectors_delta, true); -err: - if (!ret && sectors_allocated) - bch2_increment_clock(c, sectors_allocated, WRITE); - - bch2_open_buckets_put(c, &open_buckets); - bch2_disk_reservation_put(c, &disk_res); - bch2_bkey_buf_exit(&new, c); - bch2_bkey_buf_exit(&old, c); - - if (closure_nr_remaining(&cl) != 1) { - bch2_trans_unlock(trans); - closure_sync(&cl); - } - - return ret; -} - -/* - * Returns -BCH_ERR_transacton_restart if we had to drop locks: - */ -int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - subvol_inum inum, u64 end, - s64 *i_sectors_delta) -{ - struct bch_fs *c = trans->c; - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct bpos end_pos = POS(inum.inum, end); - struct bkey_s_c k; - int ret = 0, ret2 = 0; - u32 snapshot; - - while (!ret || - bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i delete; - - if (ret) - ret2 = ret; - - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - continue; - - bch2_btree_iter_set_snapshot(iter, snapshot); - - /* - * peek_upto() doesn't have ideal semantics for extents: - */ - k = bch2_btree_iter_peek_upto(iter, end_pos); - if (!k.k) - break; - - ret = bkey_err(k); - if (ret) - continue; - - bkey_init(&delete.k); - delete.k.p = iter->pos; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end_pos, &delete); - - ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, 0, i_sectors_delta, false); - bch2_disk_reservation_put(c, &disk_res); - } - - return ret ?: ret2; -} - -int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - s64 *i_sectors_delta) -{ - struct btree_trans trans; - struct btree_iter iter; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(inum.inum, start), - BTREE_ITER_INTENT); - - ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - - return ret; -} - static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -1899,1140 +1647,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) printbuf_indent_sub(out, 2); } -/* Cache promotion on read */ - -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct data_update write; - struct bio_vec bi_inline_vecs[0]; /* must be last */ -}; - -static const struct rhashtable_params bch_promote_params = { - .head_offset = offsetof(struct promote_op, hash), - .key_offset = offsetof(struct promote_op, pos), - .key_len = sizeof(struct bpos), -}; - -static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, - struct bpos pos, - struct bch_io_opts opts, - unsigned flags) -{ - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; - - if (!opts.promote_target) - return false; - - if (bch2_bkey_has_target(c, k, opts.promote_target)) - return false; - - if (bkey_extent_is_unwritten(k)) - return false; - - if (bch2_target_congested(c, opts.promote_target)) { - /* XXX trace this */ - return false; - } - - if (rhashtable_lookup_fast(&c->promote_table, &pos, - bch_promote_params)) - return false; - - return true; -} - -static void promote_free(struct bch_fs *c, struct promote_op *op) -{ - int ret; - - bch2_data_update_exit(&op->write); - - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - kfree_rcu(op, rcu); -} - -static void promote_done(struct bch_write_op *wop) -{ - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; - - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bio *bio = &op->write.op.wbio.bio; - - trace_and_count(op->write.op.c, read_promote, &rbio->bio); - - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - - bch2_data_update_read_done(&op->write, rbio->pick.crc); -} - -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio) -{ - struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - int ret; - - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return NULL; - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); - if (!op) - goto err; - - op->start_time = local_clock(); - op->pos = pos; - - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_NOFS); - if (!*rbio) - goto err; - - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); - - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOFS)) - goto err; - - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; - - if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, - bch_promote_params)) - goto err; - - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - ret = bch2_data_update_init(trans, NULL, &op->write, - writepoint_hashed((unsigned long) current), - opts, - (struct data_update_opts) { - .target = opts.promote_target, - .extra_replicas = 1, - .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, - }, - btree_id, k); - /* - * possible errors: -BCH_ERR_nocow_lock_blocked, - * -BCH_ERR_ENOSPC_disk_reservation: - */ - if (ret) { - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - goto err; - } - - op->write.op.end_io = promote_done; - - return op; -err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; - kfree(op); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); - return NULL; -} - -noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) -{ - struct bch_fs *c = trans->c; - bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); - /* data might have to be decompressed in the write path: */ - unsigned sectors = promote_full - ? max(pick->crc.compressed_size, pick->crc.live_size) - : bvec_iter_sectors(iter); - struct bpos pos = promote_full - ? bkey_start_pos(k.k) - : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; - - if (!should_promote(c, k, pos, opts, flags)) - return NULL; - - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio); - if (!promote) - return NULL; - - *bounce = true; - *read_full = promote_full; - return promote; -} - -/* Read */ - -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - -enum rbio_context { - RBIO_CONTEXT_NULL, - RBIO_CONTEXT_HIGHPRI, - RBIO_CONTEXT_UNBOUND, -}; - -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - -__always_inline -static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, - enum rbio_context context, - struct workqueue_struct *wq) -{ - if (context <= rbio->context) { - fn(&rbio->work); - } else { - rbio->work.func = fn; - rbio->context = context; - queue_work(wq, &rbio->work); - } -} - -static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -{ - BUG_ON(rbio->bounce && !rbio->split); - - if (rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - - if (rbio->split) { - struct bch_read_bio *parent = rbio->parent; - - if (rbio->kmalloc) - kfree(rbio); - else - bio_put(&rbio->bio); - - rbio = parent; - } - - return rbio; -} - -/* - * Only called on a top level bch_read_bio to complete an entire read request, - * not a split: - */ -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - if (rbio->start_time) - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], - rbio->start_time); - bio_endio(&rbio->bio); -} - -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); -retry: - rbio->bio.bi_status = 0; - - k = bch2_btree_iter_peek_slot(&iter); - if (bkey_err(k)) - goto err; - - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { - /* extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; - } - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - return; -err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; -} - -static void bch2_rbio_retry(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - subvol_inum inum = { - .subvol = rbio->subvol, - .inum = rbio->read_pos.inode, - }; - struct bch_io_failures failed = { .nr = 0 }; - - trace_and_count(c, read_retry, &rbio->bio); - - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); - - rbio->bio.bi_status = 0; - - rbio = bch2_rbio_free(rbio); - - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; - - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - __bch2_read(c, rbio, iter, inum, &failed, flags); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) -{ - rbio->retry = retry; - - if (rbio->flags & BCH_READ_IN_RETRY) - return; - - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); - - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); - } -} - -static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, - struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; - struct bch_extent_crc_unpacked new_crc; - struct btree_iter iter; - struct bkey_i *new; - struct bkey_s_c k; - int ret = 0; - - if (crc_is_compressed(rbio->pick.crc)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = bkey_err(k))) - goto out; - - if (bversion_cmp(k.k->version, rbio->version) || - !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) - goto out; - - /* Extent was merged? */ - if (bkey_start_offset(k.k) < data_offset || - k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) - goto out; - - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - bkey_start_offset(k.k) - data_offset, k.k->size, - rbio->pick.crc.csum_type)) { - bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); - ret = 0; - goto out; - } - - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - sizeof(struct bch_extent_crc128)); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - - if (!bch2_bkey_narrow_crcs(new, new_crc)) - goto out; - - ret = bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -out: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -{ - bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, - __bch2_rbio_narrow_crcs(&trans, rbio)); -} - -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - unsigned nofs_flags; - struct bch_csum csum; - int ret; - - nofs_flags = memalloc_nofs_save(); - - /* Reset iterator for checksumming and copying bounced data: */ - if (rbio->bounce) { - src->bi_iter.bi_size = crc.compressed_size << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; - } else { - src->bi_iter = rbio->bvec_iter; - } - - csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) - goto csum_err; - - /* - * XXX - * We need to rework the narrow_crcs path to deliver the read completion - * first, and then punt to a different workqueue, otherwise we're - * holding up reads while doing btree updates which is bad for memory - * reclaim. - */ - if (unlikely(rbio->narrow_crcs)) - bch2_rbio_narrow_crcs(rbio); - - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; - - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); - - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; - - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - if (rbio->bounce) { - struct bvec_iter src_iter = src->bi_iter; - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } - } - - if (rbio->promote) { - /* - * Re encrypt data we decrypted, so it's consistent with - * rbio->crc: - */ - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; - } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { - rbio = bch2_rbio_free(rbio); - bch2_rbio_done(rbio); - } -out: - memalloc_nofs_restore(nofs_flags); - return; -csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); - bch2_io_error(ca); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; -decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - goto out; -} - -static void bch2_read_endio(struct bio *bio) +void bch2_fs_io_write_exit(struct bch_fs *c) { - struct bch_read_bio *rbio = - container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); - struct workqueue_struct *wq = NULL; - enum rbio_context context = RBIO_CONTEXT_NULL; - - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } - - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status))) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); - return; - } - - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr)) { - trace_and_count(c, read_reuse_race, &rbio->bio); - - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); - else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); - return; - } - - if (rbio->narrow_crcs || - rbio->promote || - crc_is_compressed(rbio->pick.crc) || - bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; - else if (rbio->pick.crc.csum_type) - context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - - bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -} - -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -EIO; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, - struct bkey_s_c k, - struct bch_extent_ptr ptr) -{ - struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); - struct btree_iter iter; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - PTR_BUCKET_POS(c, &ptr), - BTREE_ITER_CACHED); - - prt_printf(&buf, "Attempting to read from stale dirty pointer:"); - printbuf_indent_add(&buf, 2); - prt_newline(&buf); - - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); - - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - if (!ret) { - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - } - - bch2_fs_inconsistent(c, "%s", buf.buf); - - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); -} - -int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bpos read_pos, - enum btree_id data_btree, struct bkey_s_c k, - unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct extent_ptr_decoded pick; - struct bch_read_bio *rbio = NULL; - struct bch_dev *ca = NULL; - struct promote_op *promote = NULL; - bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; - - if (bkey_extent_is_inline_data(k.k)) { - unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_inline_data_bytes(k.k)); - - swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); - swap(iter.bi_size, bytes); - bio_advance_iter(&orig->bio, &iter, bytes); - zero_fill_bio_iter(&orig->bio, iter); - goto out_read_done; - } -retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); - - /* hole or reservation - just zero fill: */ - if (!pick_ret) - goto hole; - - if (pick_ret < 0) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from"); - goto err; - } - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - - /* - * Stale dirty pointers are treated as IO errors, but @failed isn't - * allocated unless we're in the retry path - so if we're not in the - * retry path, don't check here, it'll be caught in bch2_read_endio() - * and we'll end up in the retry path: - */ - if ((flags & BCH_READ_IN_RETRY) && - !pick.ptr.cached && - unlikely(ptr_stale(ca, &pick.ptr))) { - read_from_stale_dirty_pointer(trans, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); - goto retry_pick; - } - - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); - - if (flags & BCH_READ_NODECODE) { - /* - * can happen if we retry, and the extent we were going to read - * has been merged in the meantime: - */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) - goto hole; - - iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { - read_full = true; - bounce = true; - } - - if (orig->opts.promote_target) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full); - - if (!read_full) { - EBUG_ON(crc_is_compressed(pick.crc)); - EBUG_ON(pick.crc.csum_type && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - bvec_iter_sectors(iter) != pick.crc.live_size || - pick.crc.offset || - offset_into_extent)); - - data_pos.offset += offset_into_extent; - pick.ptr.offset += pick.crc.offset + - offset_into_extent; - offset_into_extent = 0; - pick.crc.compressed_size = bvec_iter_sectors(iter); - pick.crc.uncompressed_size = bvec_iter_sectors(iter); - pick.crc.offset = 0; - pick.crc.live_size = bvec_iter_sectors(iter); - offset_into_extent = 0; - } -get_bio: - if (rbio) { - /* - * promote already allocated bounce rbio: - * promote needs to allocate a bio big enough for uncompressing - * data in the write path, but we're not going to use it all - * here: - */ - EBUG_ON(rbio->bio.bi_iter.bi_size < - pick.crc.compressed_size << 9); - rbio->bio.bi_iter.bi_size = - pick.crc.compressed_size << 9; - } else if (bounce) { - unsigned sectors = pick.crc.compressed_size; - - rbio = rbio_init(bio_alloc_bioset(NULL, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - 0, - GFP_NOFS, - &c->bio_read_split), - orig->opts); - - bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { - /* - * Have to clone if there were any splits, due to error - * reporting issues (if a split errored, and retrying didn't - * work, when it reports the error to its parent (us) we don't - * know if the error was from our bio, and we should retry, or - * from the whole bio, in which case we don't want to retry and - * lose the error) - */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, - &c->bio_read_split), - orig->opts); - rbio->bio.bi_iter = iter; - rbio->split = true; - } else { - rbio = orig; - rbio->bio.bi_iter = iter; - EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); - } - - EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - - rbio->c = c; - rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else - rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; - rbio->offset_into_extent= offset_into_extent; - rbio->flags = flags; - rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); - rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; - rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); - rbio->pick = pick; - rbio->subvol = orig->subvol; - rbio->read_pos = read_pos; - rbio->data_btree = data_btree; - rbio->data_pos = data_pos; - rbio->version = k.k->version; - rbio->promote = promote; - INIT_WORK(&rbio->work, NULL); - - rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick.ptr.offset; - rbio->bio.bi_end_io = bch2_read_endio; - - if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); - - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); - bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - - /* - * If it's being moved internally, we don't want to flag it as a cache - * hit: - */ - if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) - bch2_bucket_io_time_reset(trans, pick.ptr.dev, - PTR_BUCKET_NR(ca, &pick.ptr), READ); - - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { - bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); - } - - if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], - bio_sectors(&rbio->bio)); - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - - if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); - } - - /* - * We just submitted IO which may block, we expect relock fail - * events and shouldn't count them: - */ - trans->notrace_relock_fail = true; - } else { - /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(c, rbio)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - goto out; - } - - if (likely(!(flags & BCH_READ_IN_RETRY))) - bio_endio(&rbio->bio); - } -out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { - return 0; - } else { - int ret; - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); - - ret = rbio->retry; - rbio = bch2_rbio_free(rbio); - - if (ret == READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; - - return ret; - } - -err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; - - orig->bio.bi_status = BLK_STS_IOERR; - goto out_read_done; - -hole: - /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: - */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; - - zero_fill_bio_iter(&orig->bio, iter); -out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; -} - -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_buf sk; - struct bkey_s_c k; - u32 snapshot; - int ret; - - BUG_ON(flags & BCH_READ_NODECODE); - - bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, bvec_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(&trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, bvec_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(&trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - /* - * With indirect extents, the amount of data to read is the min - * of the original extent and the indirect extent: - */ - sectors = min(sectors, k.k->size - offset_into_extent); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, - data_btree, k, - offset_into_extent, failed, flags); - if (ret) - break; - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(&trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(&trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == READ_RETRY || - ret == READ_RETRY_AVOID) - goto retry; - - bch2_trans_exit(&trans); - bch2_bkey_buf_exit(&sk, c); - - if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); - } -} - -void bch2_fs_io_exit(struct bch_fs *c) -{ - if (c->promote_table.tbl) - rhashtable_destroy(&c->promote_table); mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); } -int bch2_fs_io_init(struct bch_fs *c) +int bch2_fs_io_write_init(struct bch_fs *c) { - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; - - if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; - if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_write_init; @@ -3044,8 +1666,5 @@ int bch2_fs_io_init(struct bch_fs *c) PAGE_SIZE, 0)) return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; - return 0; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h new file mode 100644 index 000000000000..9323167229ee --- /dev/null +++ b/fs/bcachefs/io_write.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_H +#define _BCACHEFS_IO_WRITE_H + +#include "checksum.h" +#include "io_write_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +void bch2_write(struct closure *); + +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +void bch2_fs_io_write_exit(struct bch_fs *); +int bch2_fs_io_write_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_write_types.h index 737f16d78c48..c7f97c2c4805 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_write_types.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_IO_TYPES_H -#define _BCACHEFS_IO_TYPES_H +#ifndef _BCACHEFS_IO_WRITE_TYPES_H +#define _BCACHEFS_IO_WRITE_TYPES_H #include "alloc_types.h" #include "btree_types.h" @@ -13,75 +13,6 @@ #include <linux/llist.h> #include <linux/workqueue.h> -struct bch_read_bio { - struct bch_fs *c; - u64 start_time; - u64 submit_time; - - /* - * Reads will often have to be split, and if the extent being read from - * was checksummed or compressed we'll also have to allocate bounce - * buffers and copy the data back into the original bio. - * - * If we didn't have to split, we have to save and restore the original - * bi_end_io - @split below indicates which: - */ - union { - struct bch_read_bio *parent; - bio_end_io_t *end_io; - }; - - /* - * Saved copy of bio->bi_iter, from submission time - allows us to - * resubmit on IO error, and also to copy data back to the original bio - * when we're bouncing: - */ - struct bvec_iter bvec_iter; - - unsigned offset_into_extent; - - u16 flags; - union { - struct { - u16 bounce:1, - split:1, - kmalloc:1, - have_ioref:1, - narrow_crcs:1, - hole:1, - retry:2, - context:2; - }; - u16 _state; - }; - - struct bch_devs_list devs_have; - - struct extent_ptr_decoded pick; - - /* - * pos we read from - different from data_pos for indirect extents: - */ - u32 subvol; - struct bpos read_pos; - - /* - * start pos of data we read (may not be pos of data we want) - for - * promote, narrow extents paths: - */ - enum btree_id data_btree; - struct bpos data_pos; - struct bversion version; - - struct promote_op *promote; - - struct bch_io_opts opts; - - struct work_struct work; - - struct bio bio; -}; - struct bch_write_bio { struct_group(wbio, struct bch_fs *c; @@ -162,4 +93,4 @@ struct bch_write_op { struct bch_write_bio wbio; }; -#endif /* _BCACHEFS_IO_TYPES_H */ +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 34740dca4b15..0e606009dc46 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -8,7 +8,6 @@ #include "checksum.h" #include "disk_groups.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 3d7c5b919421..4746dfa7af97 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -10,7 +10,7 @@ #include "buckets.h" #include "errcode.h" #include "extents.h" -#include "io.h" +#include "io_write.h" #include "journal.h" #include "keylist.h" #include "migrate.h" diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index fb76a1dac74e..ac4df53bfde2 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -14,7 +14,8 @@ #include "errcode.h" #include "error.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal_reclaim.h" #include "keylist.h" #include "move.h" diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index c3136abe8587..cbdd58db8782 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "bcachefs_ioctl.h" #include "btree_iter.h" #include "buckets.h" #include "data_update.h" diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index ac658e99bf57..2371fd61ea58 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -13,25 +13,17 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" -#include "disk_groups.h" #include "errcode.h" #include "error.h" -#include "extents.h" -#include "eytzinger.h" -#include "io.h" -#include "keylist.h" #include "lru.h" #include "move.h" #include "movinggc.h" -#include "super-io.h" #include "trace.h" -#include <linux/bsearch.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/math64.h> #include <linux/sched/task.h> -#include <linux/sort.h> #include <linux/wait.h> struct buckets_in_flight { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 016cf0834b3d..568f1e8e7507 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -8,8 +8,6 @@ #include "compress.h" #include "disk_groups.h" #include "errcode.h" -#include "extents.h" -#include "io.h" #include "move.h" #include "rebalance.h" #include "super-io.h" diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 39f711d5069e..f155428ff395 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -5,9 +5,11 @@ #include "buckets.h" #include "extents.h" #include "inode.h" -#include "io.h" +#include "io_misc.h" +#include "io_write.h" #include "reflink.h" #include "subvolume.h" +#include "super-io.h" #include <linux/sched/signal.h> diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f01883e785a5..5a1115396edc 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -6,7 +6,6 @@ #include "disk_groups.h" #include "ec.h" #include "error.h" -#include "io.h" #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7cfc04947717..55176023f15b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -35,7 +35,8 @@ #include "fs-io-direct.h" #include "fsck.h" #include "inode.h" -#include "io.h" +#include "io_read.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" @@ -483,7 +484,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); - bch2_fs_io_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); @@ -848,7 +850,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: |