From 76426098e419c1732efc3f88166f3f3592c215c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2019 09:59:56 -0400 Subject: bcachefs: Reflink Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 26 +++- fs/bcachefs/bkey.h | 2 + fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/btree_types.h | 9 +- fs/bcachefs/btree_update_leaf.c | 3 +- fs/bcachefs/buckets.c | 100 +++++++++++++- fs/bcachefs/extents.c | 50 +++++-- fs/bcachefs/extents.h | 19 ++- fs/bcachefs/fs-io.c | 218 ++++++++++++++++++++++------- fs/bcachefs/fs-io.h | 19 +++ fs/bcachefs/fs.c | 42 +++++- fs/bcachefs/fs.h | 15 +- fs/bcachefs/io.c | 127 +++++++++++++---- fs/bcachefs/io.h | 3 + fs/bcachefs/migrate.c | 13 +- fs/bcachefs/move.c | 98 ++++++++----- fs/bcachefs/move.h | 3 +- fs/bcachefs/recovery.c | 18 +-- fs/bcachefs/reflink.c | 300 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/reflink.h | 32 +++++ fs/bcachefs/replicas.c | 1 + 23 files changed, 945 insertions(+), 159 deletions(-) create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index c29ccdb45965..4c2608409144 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -44,6 +44,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + reflink.o \ replicas.o \ siphash.o \ six.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 68e2d3b1a9a6..410fce3ed8d4 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -361,6 +361,7 @@ enum gc_phase { GC_PHASE_BTREE_XATTRS, GC_PHASE_BTREE_ALLOC, GC_PHASE_BTREE_QUOTAS, + GC_PHASE_BTREE_REFLINK, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -750,6 +751,9 @@ struct bch_fs { struct work_struct ec_stripe_delete_work; struct llist_head ec_stripe_delete_list; + /* REFLINK */ + u64 reflink_hint; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b8aafd2e283a..62afea1e7ec3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -340,7 +340,9 @@ static inline void bkey_init(struct bkey *k) x(xattr, 11) \ x(alloc, 12) \ x(quota, 13) \ - x(stripe, 14) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -895,6 +897,24 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + + __le32 reservation_generation; + __u8 nr_replicas; + __u8 pad[3]; +}; + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1297,6 +1317,7 @@ enum bch_sb_features { BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NR, }; @@ -1487,7 +1508,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") + x(EC, 6, "erasure_coding") \ + x(REFLINK, 7, "reflink") enum btree_id { #define x(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index b3a08e52e6b3..321fe6fe0b55 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -560,6 +560,8 @@ BKEY_VAL_ACCESSORS(xattr); BKEY_VAL_ACCESSORS(alloc); BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); +BKEY_VAL_ACCESSORS(reflink_p); +BKEY_VAL_ACCESSORS(reflink_v); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8af16ca994e0..6fa6ac1fadc1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -10,6 +10,7 @@ #include "extents.h" #include "inode.h" #include "quota.h" +#include "reflink.h" #include "xattr.h" const char * const bch2_bkey_types[] = { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ec14e2deecb7..621cbfa22fc9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -464,7 +464,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return type == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_REFLINK: + return true; + default: + return false; + } } static inline bool btree_node_is_extents(struct btree *b) @@ -480,6 +486,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: case BKEY_TYPE_EC: + case BKEY_TYPE_REFLINK: return true; default: return false; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5f94b6e9cf28..443ffb5c709d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -521,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans, { return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES); + i->iter->btree_id == BTREE_ID_INODES || + i->iter->btree_id == BTREE_ID_REFLINK); } static inline bool update_has_triggers(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index baf9642d21ca..3d243f2d1095 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -972,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); - return -1; + return -EIO; } BUG_ON(m->r.e.data_type != data_type); @@ -1144,6 +1144,7 @@ int bch2_mark_key_locked(struct bch_fs *c, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; @@ -1304,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, xchg(&warned_disk_usage, 1)) return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + bch_err(c, "disk usage increased more than %llu sectors reserved", + disk_res_sectors); trans_for_each_update_iter(trans, i) { struct btree_iter *iter = i->iter; @@ -1319,7 +1321,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, node_iter = iter->l[0].iter; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; @@ -1471,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, s64 sectors, enum bch_data_type data_type) { + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; @@ -1487,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, return ret; if (k.k->type != KEY_TYPE_stripe) { - bch_err_ratelimited(trans->c, - "pointer to nonexistent stripe %llu", - (u64) p.idx); - ret = -1; + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -EIO; goto out; } @@ -1578,6 +1581,84 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_REFLINK, + POS(0, idx), &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_v) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto err; + + bkey_reassemble(new_k, k); + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } +out: + ret = k.k->p.offset - idx; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, unsigned offset, + s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + break; + + idx += ret; + sectors = max_t(s64, 0LL, sectors - ret); + ret = 0; + } + + return ret; +} + int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, s64 sectors, unsigned flags) { @@ -1593,6 +1674,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_BTREE); case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_USER); case KEY_TYPE_inode: @@ -1616,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d->fs_usage.persistent_reserved[replicas - 1] += sectors; return 0; } + case KEY_TYPE_reflink_p: + return bch2_trans_mark_reflink_p(trans, + bkey_s_c_to_reflink_p(k), + offset, sectors, flags); default: return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 11defa3d99a5..81ec55526ce9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -744,7 +744,8 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) case KEY_TYPE_error: case KEY_TYPE_cookie: break; - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; @@ -774,6 +775,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) break; } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } case KEY_TYPE_reservation: break; default: @@ -968,6 +975,33 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, } break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = end->offset - bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } } return ret; @@ -1561,17 +1595,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return false; } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1582,7 +1616,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 156d8e37045a..cef93af25858 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -306,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -436,8 +444,8 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); @@ -452,17 +460,24 @@ static inline bool bkey_extent_is_data(const struct bkey *k) switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } +/* + * Should extent be counted under inode->i_sectors? + */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ef94aecaa7cb..771fb111550d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -16,6 +16,7 @@ #include "io.h" #include "keylist.h" #include "quota.h" +#include "reflink.h" #include "trace.h" #include @@ -201,9 +202,9 @@ static int inode_set_size(struct bch_inode_info *inode, return 0; } -static int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, @@ -936,15 +937,12 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? SECTOR_RESERVED : SECTOR_ALLOCATED; - BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); - BUG_ON(bio_end_sector(bio) > k.k->p.offset); - - bio_for_each_segment(bv, bio, iter) { struct bch_page_state *s = bch2_page_state(bv.bv_page); unsigned i; @@ -959,10 +957,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, u64 offset, + struct bio *bio, + unsigned sectors_this_extent, bool get_more) { - while (bio_end_sector(bio) < offset && + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; struct page *page = readpage_iter_next(iter); @@ -1012,35 +1011,39 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + int ret = 0; rbio->c = c; rbio->start_time = local_clock(); - +retry: while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); - BUG_ON(!k.k); - - if (IS_ERR(k.k)) { - int ret = btree_iter_err(iter); - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); - return; - } + ret = bkey_err(k); + if (ret) + break; bkey_reassemble(&tmp.k, k); - bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); if (readpages_iter) { bool want_full_extent = false; @@ -1055,13 +1058,11 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, (p.crc.compression_type != 0)); } - readpage_bio_extend(readpages_iter, - &rbio->bio, k.k->p.offset, - want_full_extent); + readpage_bio_extend(readpages_iter, &rbio->bio, + sectors, want_full_extent); } - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1078,6 +1079,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } + + if (ret == -EINTR) + goto retry; + + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } void bch2_readahead(struct readahead_control *ractl) @@ -2256,29 +2263,25 @@ out: /* truncate: */ -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset, u64 *journal_seq) +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bch_inode_info *inode, + u64 new_i_size) { - struct bpos start = POS(inode->v.i_ino, start_offset); - struct bpos end = POS(inode->v.i_ino, end_offset); + struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, - BTREE_ITER_INTENT); + int ret = 0, ret2 = 0; while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto btree_err; + bkey_init(&delete.k); delete.k.p = iter->pos; @@ -2286,23 +2289,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete.k); - bch2_trans_begin_updates(&trans); + bch2_trans_begin_updates(trans); - ret = bch2_extent_update(&trans, inode, + ret = bch2_extent_update(trans, inode, &disk_res, NULL, iter, &delete, - 0, true, true, NULL); + new_i_size, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); - - if (ret == -EINTR) +btree_err: + if (ret == -EINTR) { + ret2 = ret; ret = 0; + } if (ret) break; + } - bch2_trans_cond_resched(&trans); + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); } + return ret ?: ret2; +} + +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, start_offset), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, + POS(inode->v.i_ino, end_offset), + inode, 0); + bch2_trans_exit(&trans); + if (ret == -EINTR) + ret = 0; + return ret; } @@ -2510,7 +2541,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ret = __bch2_fpunch(c, inode, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (unlikely(ret)) goto err; @@ -2557,8 +2588,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, discard_start, discard_end); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); @@ -2670,7 +2700,7 @@ bkey_err: ret = __bch2_fpunch(c, inode, round_up(new_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (ret) goto err; @@ -2853,6 +2883,94 @@ long bch2_fallocate_dispatch(struct file *file, int mode, return -EOPNOTSUPP; } +static void mark_range_unallocated(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + struct bch_page_state *s; + + folio_lock(folio); + s = bch2_page_state(&folio->page); + + if (s) + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + loff_t ret = 0; + loff_t aligned_len; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + aligned_len = round_up(len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + aligned_len); + if (ret) + goto out_unlock; + + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, dst, + POS(dst->v.i_ino, pos_dst >> 9), + POS(src->v.i_ino, pos_src >> 9), + aligned_len >> 9, + pos_dst + len); + if (ret > 0) + ret = min(ret << 9, len); + +out_unlock: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return ret; +} + /* fseek: */ static int folio_data_offset(struct folio *folio, unsigned offset) diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index e263b515e901..861ec25ab9ef 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -9,6 +9,22 @@ #include +struct quota_res; + +int bch2_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_inode_info *, u64); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + int bch2_writepage(struct page *, struct writeback_control *); int bch2_read_folio(struct file *, struct folio *); @@ -28,6 +44,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_fault(struct vm_fault *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 54e555fb4d5d..fad019d3c3f5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1157,6 +1157,9 @@ static int bch2_fill_extent(struct bch_fs *c, struct extent_ptr_decoded p; int ret; + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1200,6 +1203,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) cur, prev; + unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1212,15 +1216,36 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), + BTREE_ITER_SLOTS); + + while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) { + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; bkey_reassemble(&cur.k, k); k = bkey_i_to_s_c(&cur.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &cur.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + offset_into_extent), + &cur.k); + bch2_key_resize(&cur.k.k, sectors); + cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; + if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (have_extent) { @@ -1233,12 +1258,16 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(&prev.k, &cur.k); have_extent = true; } + + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, + iter->pos.offset + sectors)); } if (!ret && have_extent) ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), FIEMAP_EXTENT_LAST); - +err: ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? ret : 0; } @@ -1286,6 +1315,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index de07f0f1dd51..6edf5dd803f0 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -59,7 +59,8 @@ static inline int ptrcmp(void *l, void *r) enum bch_inode_lock_op { INODE_LOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), }; #define bch2_lock_inodes(_locks, ...) \ @@ -71,9 +72,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ } while (0) @@ -87,9 +90,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ed84572a9e67..4d359931edb3 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1041,6 +1041,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1097,6 +1098,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct data_opts) { .target = opts.promote_target }, + btree_id, bkey_s_c_null); BUG_ON(ret); @@ -1134,7 +1136,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, + pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1278,18 +1284,25 @@ retry: POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; - bytes = min_t(unsigned, bvec_iter_sectors(bvec_iter), - (k.k->size - offset_into_extent)) << 9; + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(&trans); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); ret = __bch2_read_extent(c, rbio, bvec_iter, k, @@ -1569,6 +1582,48 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } +int bch2_read_indirect_extent(struct btree_trans *trans, + struct btree_iter *extent_iter, + unsigned *offset_into_extent, + struct bkey_i *orig_k) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + if (orig_k->k.type != KEY_TYPE_reflink_p) + return 0; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + *offset_into_extent; + + iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS, 1); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect extent"); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bkey_reassemble(orig_k, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, unsigned offset_into_extent, @@ -1644,6 +1699,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; + offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; @@ -1829,25 +1885,47 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + + while (1) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, + POS(inode, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + goto err; + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1856,21 +1934,18 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - +out: bch2_trans_exit(&trans); + return; +err: + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); + goto out; } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index aa437cb05fe7..a768ccc90f1f 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,6 +99,9 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; +int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *, + unsigned *, struct bkey_i *); + enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 301cb72bd3e4..dc3b03d6e627 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, + enum btree_id btree_id) { struct btree_trans trans; struct btree_iter *iter; @@ -44,8 +45,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -98,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); +} + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ffa0c2bbe290..05bb74a36230 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -63,13 +63,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - struct bkey_i_extent *insert, *new = + struct bkey_i *insert; + struct bkey_i_extent *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; @@ -86,26 +87,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto nomatch; if (m->data_cmd == DATA_REWRITE && - !bch2_extent_has_device(bkey_s_c_to_extent(k), - m->data_opts.rewrite_dev)) + !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) goto nomatch; bkey_reassemble(&_insert.k, k); - insert = bkey_i_to_extent(&_insert.k); + insert = &_insert.k; bkey_copy(&_new.k, bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter->pos, &insert->k_i); + bch2_cut_front(iter->pos, insert); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(extent_i_to_s(insert).s, + bch2_bkey_drop_device(bkey_i_to_s(insert), m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -114,25 +114,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_ptr_decoded_append(&insert->k_i, &p); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } if (!did_work) goto nomatch; - bch2_bkey_narrow_crcs(&insert->k_i, + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, extent_i_to_s(insert).s); - bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); /* * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && @@ -148,7 +148,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) } bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &insert->k_i)); + BTREE_INSERT_ENTRY(iter, insert)); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), @@ -213,10 +213,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, struct bch_io_opts io_opts, enum data_cmd data_cmd, struct data_opts data_opts, + enum btree_id btree_id, struct bkey_s_c k) { int ret; + m->btree_id = btree_id; m->data_cmd = data_cmd; m->data_opts = data_opts; m->nr_ptrs_reserved = 0; @@ -264,11 +266,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned compressed_sectors = 0; - extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && p.crc.compression_type != BCH_COMPRESSION_NONE && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) @@ -391,6 +394,7 @@ static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, + enum btree_id btree_id, struct bkey_s_c k, enum data_cmd data_cmd, struct data_opts data_opts) @@ -443,7 +447,7 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, k); + data_cmd, data_opts, btree_id, k); if (ret) goto err_free_pages; @@ -473,16 +477,17 @@ err: return ret; } -int bch2_move_data(struct bch_fs *c, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats) +static int __bch2_move_data(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats, + enum btree_id btree_id) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct btree_trans trans; @@ -493,17 +498,13 @@ int bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; - stats->btree_id = BTREE_ID_EXTENTS; + stats->btree_id = btree_id; stats->pos = POS_MIN; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); if (rate) @@ -528,7 +529,7 @@ int bch2_move_data(struct bch_fs *c, if (unlikely(freezing(current))) { bch2_trans_unlock(&trans); - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -579,12 +580,12 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, k, + ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); + bch2_move_ctxt_wait_for_io(ctxt); continue; } @@ -602,7 +603,32 @@ next_nondata: bch2_trans_cond_resched(&trans); } out: - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats) +{ + struct moving_context ctxt = { .stats = stats }; + int ret; + + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_USER; + + ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_EXTENTS) ?: + __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_REFLINK); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 71b3d2b2ddb6..0acd1720d4f8 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -25,6 +25,7 @@ struct data_opts { }; struct migrate_write { + enum btree_id btree_id; enum data_cmd data_cmd; struct data_opts data_opts; @@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct write_point_specifier, struct bch_io_opts, enum data_cmd, struct data_opts, - struct bkey_s_c); + enum btree_id, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3742b241807c..f2899ba9ad43 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) { struct btree_trans trans; struct btree_iter *iter, *split_iter; @@ -255,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, btree_id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -341,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { replay_now_at(j, keys.journal_seq_base + i->journal_seq); - switch (i->btree_id) { - case BTREE_ID_ALLOC: + if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, i->k); - break; - default: + else if (btree_node_type_is_extents(i->btree_id)) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else ret = bch2_btree_insert(c, i->btree_id, i->k, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_NOMARK); - break; - } if (ret) { bch_err(c, "journal replay: error %d while replaying key", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000000..dcca9c1d0f47 --- /dev/null +++ b/fs/bcachefs/reflink.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "reflink.h" + +#include + +/* reflink pointers */ + +const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); +} + +enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* indirect extents */ + +const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (bkey_val_bytes(r.k) < sizeof(*r.v)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +/* + * bch2_remap_range() depends on bch2_extent_update(), which depends on various + * things tied to the linux vfs for inode updates, for now: + */ +#ifndef NO_BCACHEFS_FS + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i_extent *e) +{ + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + struct bkey_i_reflink_p *r_p; + int ret; + + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { + bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + continue; + } + + if (bkey_deleted(k.k) && e->k.size <= k.k->size) + break; + } + + if (ret) + goto err; + + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + + r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_reflink_v_init(&r_v->k_i); + r_v->k.p = reflink_iter->pos; + bch2_key_resize(&r_v->k, e->k.size); + r_v->k.version = e->k.version; + + set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + + bkey_val_u64s(&e->k)); + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + + e->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(&e->k_i); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); +err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = reflink_iter->pos.offset; + bch2_trans_iter_put(trans, reflink_iter); + } + + return ret; +} + +static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k = bch2_btree_iter_peek(iter); + + while (1) { + if (bkey_err(k)) + return k; + + if (bkey_cmp(iter->pos, end) >= 0) + return bkey_s_c_null; + + if (k.k->type == KEY_TYPE_extent || + k.k->type == KEY_TYPE_reflink_p) + return k; + + k = bch2_btree_iter_next(iter); + } +} + +s64 bch2_remap_range(struct bch_fs *c, + struct bch_inode_info *dst_inode, + struct bpos dst_start, struct bpos src_start, + u64 remap_sectors, u64 new_i_size) +{ + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; + BKEY_PADDED(k) new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; + int ret = 0; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT, 1); + dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT, 2); + + while (1) { + bch2_trans_begin_updates(&trans); + trans.mem_top = 0; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto err; + } + + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + goto btree_err; + + src_done = bpos_min(src_iter->pos, src_end).offset - + src_start.offset; + dst_want = POS(dst_start.inode, dst_start.offset + src_done); + + if (bkey_cmp(dst_iter->pos, dst_want) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, dst_want, + dst_inode, new_i_size); + if (ret) + goto btree_err; + continue; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); + + if (!bkey_cmp(dst_iter->pos, dst_end)) + break; + + if (src_k.k->type == KEY_TYPE_extent) { + bkey_reassemble(&new_src.k, src_k); + src_k = bkey_i_to_s_c(&new_src.k); + + bch2_cut_front(src_iter->pos, &new_src.k); + bch2_cut_back(src_end, &new_src.k.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, + bkey_i_to_extent(&new_src.k)); + if (ret) + goto btree_err; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(&new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_iter->pos.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k.k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k.k, + min(src_k.k->p.offset - src_iter->pos.offset, + dst_end.offset - dst_iter->pos.offset)); + + ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, + dst_iter, &new_dst.k, + new_i_size, false, true, NULL); + if (ret) + goto btree_err; + + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); +btree_err: + if (ret == -EINTR) + ret = 0; + if (ret) + goto err; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); +err: + BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + + dst_done = dst_iter->pos.offset - dst_start.offset; + new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + + ret = bch2_trans_exit(&trans) ?: ret; + + mutex_lock(&dst_inode->ei_update_lock); + if (dst_inode->v.i_size < new_i_size) { + i_size_write(&dst_inode->v, new_i_size); + ret = bch2_write_inode_size(c, dst_inode, new_i_size, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&dst_inode->ei_update_lock); + + return dst_done ?: ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 index 000000000000..327618c36d33 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +enum merge_result bch2_reflink_p_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ +} + +#ifndef NO_BCACHEFS_FS +s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, + struct bpos, struct bpos, u64, u64); +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 7a9a7ec26c93..4fb142f3d39c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; -- cgit v1.2.3