diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2018-11-01 15:13:19 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:08:11 -0400 |
commit | cd575ddf57af004913ff5a994aa5f3203216fa68 (patch) | |
tree | 15fafb5d59b359aef897a2296f1ce7f8bc33f55a | |
parent | 91f8b5677b5d831cff34b25ef03322ae49e03256 (diff) | |
download | lwn-cd575ddf57af004913ff5a994aa5f3203216fa68.tar.gz lwn-cd575ddf57af004913ff5a994aa5f3203216fa68.zip |
bcachefs: Erasure coding
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
32 files changed, 2418 insertions, 233 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index c13f2cfa6489..2f8300b60807 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -16,6 +16,8 @@ config BCACHEFS_FS select CRYPTO_CHACHA20 select CRYPTO_POLY1305 select KEYS + select RAID6_PQ + select XOR_BLOCKS help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 5318287c5ac4..b9521d772db1 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -22,6 +22,7 @@ bcachefs-y := \ debug.o \ dirent.o \ disk_groups.o \ + ec.o \ error.o \ extents.o \ fs.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 291d352ee370..b49d0cd84b78 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "journal_io.h" #include "trace.h" @@ -1113,6 +1114,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); + while (1) { + struct open_bucket *ob; + + spin_lock(&c->freelist_lock); + if (!ca->open_buckets_partial_nr) { + spin_unlock(&c->freelist_lock); + break; + } + ob = c->open_buckets + + ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + + bch2_open_bucket_put(c, ob); + } + + bch2_ec_stop_dev(c, ca); + /* * Wake up threads that were blocked on allocation, so they can notice * the device can no longer be removed and the capacity has changed: diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index df74e41ec890..6e5f6e57da56 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -62,6 +62,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "io.h" #include "trace.h" 
@@ -95,6 +96,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + if (ob->ec) { + bch2_ec_bucket_written(c, ob); + return; + } + percpu_down_read(&c->usage_lock); spin_lock(&ob->lock); @@ -114,6 +120,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->open_buckets_wait); } +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ptr.dev == dev && + ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) { struct open_bucket *ob; @@ -129,15 +148,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) } static void open_bucket_free_unused(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob) + struct open_bucket *ob, + bool may_realloc) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - if (wp->type == BCH_DATA_USER) { + if (ca->open_buckets_partial_nr < + ARRAY_SIZE(ca->open_buckets_partial) && + may_realloc) { spin_lock(&c->freelist_lock); ob->on_partial_list = true; ca->open_buckets_partial[ca->open_buckets_partial_nr++] = @@ -285,18 +306,18 @@ out: return ob; } -static int __dev_alloc_cmp(struct write_point *wp, - unsigned l, unsigned r) +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) { - return ((wp->next_alloc[l] > wp->next_alloc[r]) - - (wp->next_alloc[l] < wp->next_alloc[r])); + return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); } -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, - struct write_point *wp, 
- struct bch_devs_mask *devs) +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) { struct dev_alloc_list ret = { .nr = 0 }; struct bch_dev *ca; @@ -305,14 +326,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, for_each_member_device_rcu(ca, c, i, devs) ret.devs[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); return ret; } -void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, + struct dev_stripe_state *stripe) { - u64 *v = wp->next_alloc + ca->dev_idx; + u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = dev_buckets_free(c, ca); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) @@ -324,26 +345,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, else *v = U64_MAX; - for (v = wp->next_alloc; - v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) *v = *v < scale ? 
0 : *v - scale; } +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + static int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, - struct write_point *wp, + struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, + unsigned flags, struct closure *cl) { struct dev_alloc_list devs_sorted = - bch2_wp_alloc_list(c, wp, devs_may_alloc); + bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; bool alloc_failure = false; - unsigned i; + unsigned i, durability; BUG_ON(*nr_effective >= nr_replicas); @@ -354,13 +379,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, if (!ca) continue; - if (!ca->mi.durability && - (*have_cache || - wp->type != BCH_DATA_USER)) + if (!ca->mi.durability && *have_cache) continue; ob = bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, cl); + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { enum bucket_alloc_ret ret = -PTR_ERR(ob); @@ -375,13 +398,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, continue; } + durability = (flags & BUCKET_ALLOC_USE_DURABILITY) + ? ca->mi.durability : 1; + __clear_bit(ca->dev_idx, devs_may_alloc->d); - *nr_effective += ca->mi.durability; - *have_cache |= !ca->mi.durability; + *nr_effective += durability; + *have_cache |= !durability; ob_push(c, ptrs, ob); - bch2_wp_rescale(c, ca, wp); + bch2_dev_stripe_increment(c, ca, stripe); if (*nr_effective >= nr_replicas) return 0; @@ -390,15 +416,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, return alloc_failure ? 
-ENOSPC : -EROFS; } +/* Allocate from stripes: */ + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have = 0, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + BUG_ON(h->blocks.nr > nr_data); + BUG_ON(h->parity.nr > h->redundancy); + + devs = h->devs; + + open_bucket_for_each(c, &h->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->usage_lock); + rcu_read_lock(); + + if (h->parity.nr < h->redundancy) { + nr_have = h->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->blocks.nr < nr_data) { + nr_have = h->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + + return bch2_ec_stripe_new_alloc(c, h); +err: + rcu_read_unlock(); + percpu_up_read(&c->usage_lock); + return -1; +} + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static void bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache) +{ + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned i, ec_idx; + + if (!erasure_code) 
+ return; + + if (nr_replicas < 2) + return; + + if (ec_open_bucket(c, ptrs)) + return; + + h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + if (!h) + return; + + if (!h->s && ec_stripe_alloc(c, h)) + goto out_put_head; + + rcu_read_lock(); + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + rcu_read_unlock(); + + for (i = 0; i < devs_sorted.nr; i++) + open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + if (ob->ptr.dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + goto out_put_head; +got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; + + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += ca->mi.durability; + *have_cache |= !ca->mi.durability; + + ob_push(c, ptrs, ob); + atomic_inc(&h->s->pin); +out_put_head: + bch2_ec_stripe_head_put(h); +} + /* Sector allocator */ -static int get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache) +static void get_buckets_from_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool need_ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -410,7 +571,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c, if (*nr_effective < nr_replicas && test_bit(ob->ptr.dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_USER && !*have_cache))) { + (wp->type == BCH_DATA_USER && !*have_cache)) && + (ob->ec || !need_ec)) { __clear_bit(ob->ptr.dev, devs_may_alloc->d); *nr_effective += ca->mi.durability; *have_cache |= !ca->mi.durability; @@ -421,8 +583,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c, } } wp->ptrs = ptrs_skip; - - 
return *nr_effective < nr_replicas ? -ENOSPC : 0; } static int open_bucket_add_buckets(struct bch_fs *c, @@ -430,22 +590,25 @@ static int open_bucket_add_buckets(struct bch_fs *c, struct write_point *wp, struct bch_devs_list *devs_have, u16 target, + unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, - struct closure *cl) + struct closure *_cl) { struct bch_devs_mask devs; - const struct bch_devs_mask *t; struct open_bucket *ob; - unsigned i; + struct closure *cl = NULL; + unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY; int ret; - percpu_down_read(&c->usage_lock); - rcu_read_lock(); + if (wp->type == BCH_DATA_USER) + flags |= BUCKET_MAY_ALLOC_PARTIAL; - devs = c->rw_devs[wp->type]; + rcu_read_lock(); + devs = target_rw_devs(c, wp->type, target); + rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) @@ -454,50 +617,83 @@ static int open_bucket_add_buckets(struct bch_fs *c, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->ptr.dev, devs.d); - t = bch2_target_to_mask(c, target); - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + if (erasure_code) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, true); + if (*nr_effective >= nr_replicas) + return 0; - ret = get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache); - if (!ret) - goto out; + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache); + if (*nr_effective >= nr_replicas) + return 0; + } + + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, false); + if (*nr_effective >= nr_replicas) + return 0; + + percpu_down_read(&c->usage_lock); + rcu_read_lock(); +retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set(c, 
ptrs, wp, &devs, + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, - reserve, NULL); - if (!ret || ret == -EROFS || !cl) - goto out; + reserve, flags, cl); + if (ret && ret != -EROFS && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } - ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache, - reserve, cl); -out: rcu_read_unlock(); percpu_up_read(&c->usage_lock); return ret; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + struct open_buckets *obs, + enum bch_data_type data_type) { struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; + struct open_bucket *ob, *ob2; + unsigned i, j; - mutex_lock(&wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (!ca || ob->ptr.dev == ca->dev_idx) - open_bucket_free_unused(c, wp, ob); + open_bucket_for_each(c, obs, ob, i) { + bool drop = !ca || ob->ptr.dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); + open_bucket_for_each(c, &ob->ec->blocks, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + open_bucket_for_each(c, &ob->ec->parity, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + mutex_unlock(&ob->ec->lock); + } + + if (drop) + bch2_open_bucket_put(c, ob); else ob_push(c, &ptrs, ob); + } - wp->ptrs = ptrs; + *obs = ptrs; +} + +void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + mutex_lock(&wp->lock); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); mutex_unlock(&wp->lock); } @@ -630,6 +826,7 @@ out: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned target, + unsigned erasure_code, struct write_point_specifier write_point, struct bch_devs_list *devs_have, unsigned nr_replicas, @@ -649,26 +846,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, BUG_ON(!nr_replicas || 
!nr_replicas_required); retry: write_points_nr = c->write_points_nr; + wp = writepoint_find(c, write_point.v); + /* metadata may not allocate on cache devices: */ + if (wp->type != BCH_DATA_USER) + have_cache = true; + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } else { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, NULL); if (!ret) goto alloc_done; - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + if (ret == -EROFS && nr_effective >= nr_replicas_required) ret = 0; @@ -678,7 +886,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); wp->ptrs = ptrs; @@ -697,7 +905,8 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, + wp->type == BCH_DATA_USER); wp->ptrs = ptrs; mutex_unlock(&wp->lock); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6672101cbe26..c71cf7381729 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -17,11 +17,11 @@ struct dev_alloc_list { u8 devs[BCH_SB_MEMBERS_MAX]; }; -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, - struct write_point *, - struct bch_devs_mask *); -void bch2_wp_rescale(struct 
bch_fs *, struct bch_dev *, - struct write_point *); +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, + struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); @@ -43,6 +43,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ (_i)++) +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) @@ -76,7 +92,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, + unsigned, unsigned, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, @@ -88,6 +104,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, + struct open_buckets *, enum bch_data_type); + void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 2a9c6f0344ed..ef3e400c7d3d 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,6 +8,8 @@ #include "clock_types.h" #include "fifo.h" +struct ec_bucket_buf; + /* There's two of these clocks, one for reads and one for writes: */ struct bucket_clock { /* @@ -56,8 +58,10 @@ struct open_bucket { u8 freelist; bool valid; bool 
on_partial_list; + u8 ec_idx; unsigned sectors_free; struct bch_extent_ptr ptr; + struct ec_stripe_new *ec; }; #define OPEN_BUCKET_LIST_MAX 15 @@ -67,18 +71,23 @@ struct open_buckets { u8 v[OPEN_BUCKET_LIST_MAX]; }; +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + struct write_point { struct hlist_node node; struct mutex lock; u64 last_used; unsigned long write_point; enum bch_data_type type; + bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; struct open_buckets ptrs; - u64 next_alloc[BCH_SB_MEMBERS_MAX]; + struct dev_stripe_state stripe; }; struct write_point_specifier { diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 22df84b78f4b..b33fbf709705 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -204,7 +204,7 @@ #define dynamic_fault(...) 0 #define race_fault(...) 0 -#define bch2_fs_init_fault(name) \ +#define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ dynamic_fault("bcachefs:meta:read:" name) @@ -273,7 +273,10 @@ do { \ BCH_DEBUG_PARAM(test_alloc_startup, \ "Force allocator startup to use the slowpath where it" \ "can't find enough free buckets without invalidating" \ - "cached data") + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -311,6 +314,7 @@ enum bch_time_stats { #include "btree_types.h" #include "buckets_types.h" #include "clock_types.h" +#include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" @@ -333,9 +337,13 @@ enum gc_phase { GC_PHASE_START, GC_PHASE_SB, -#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd, - DEFINE_BCH_BTREE_IDS() -#undef DEF_BTREE_ID + GC_PHASE_BTREE_EC, + GC_PHASE_BTREE_EXTENTS, + GC_PHASE_BTREE_INODES, + 
GC_PHASE_BTREE_DIRENTS, + GC_PHASE_BTREE_XATTRS, + GC_PHASE_BTREE_ALLOC, + GC_PHASE_BTREE_QUOTAS, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -684,6 +692,21 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* ERASURE CODING */ + struct list_head ec_new_stripe_list; + struct mutex ec_new_stripe_lock; + + GENRADIX(struct ec_stripe) ec_stripes; + struct mutex ec_stripes_lock; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; + + struct bio_set ec_bioset; + + struct work_struct ec_stripe_delete_work; + struct llist_head ec_stripe_delete_list; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ecb7a97ee533..a00e77fa1d37 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -238,6 +238,9 @@ struct bkey_packed { } __attribute__((packed, aligned(8))); #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + #define KEY_PACKED_BITS_START 24 #define KEY_FORMAT_LOCAL_BTREE 0 @@ -465,8 +468,9 @@ enum bch_compression_type { x(ptr, 0) \ x(crc32, 1) \ x(crc64, 2) \ - x(crc128, 3) -#define BCH_EXTENT_ENTRY_MAX 4 + x(crc128, 3) \ + x(stripe_ptr, 4) +#define BCH_EXTENT_ENTRY_MAX 5 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -557,7 +561,7 @@ struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:1, cached:1, - erasure_coded:1, + unused:1, reservation:1, offset:44, /* 8 petabytes */ dev:8, @@ -567,23 +571,35 @@ struct bch_extent_ptr { dev:8, offset:44, reservation:1, - erasure_coded:1, + unused:1, cached:1, type:1; #endif } __attribute__((packed, aligned(8))); -struct bch_extent_reservation { +struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, - unused:23, + block:8, + idx:51; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 
idx:51, + block:8, + type:5; +#endif +}; + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:22, replicas:4, generation:32; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 generation:32, replicas:4, - unused:23, - type:5; + unused:22, + type:6; #endif }; @@ -706,7 +722,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas, 8) \ BCH_INODE_FIELD(bi_promote_target, 16) \ BCH_INODE_FIELD(bi_foreground_target, 16) \ - BCH_INODE_FIELD(bi_background_target, 16) + BCH_INODE_FIELD(bi_background_target, 16) \ + BCH_INODE_FIELD(bi_erasure_code, 16) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ @@ -716,7 +733,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas) \ BCH_INODE_FIELD(bi_promote_target) \ BCH_INODE_FIELD(bi_foreground_target) \ - BCH_INODE_FIELD(bi_background_target) + BCH_INODE_FIELD(bi_background_target) \ + BCH_INODE_FIELD(bi_erasure_code) enum { /* @@ -876,6 +894,27 @@ struct bch_quota { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(quota, BCH_QUOTA); +/* Erasure coding */ + +enum { + BCH_STRIPE = 128, +}; + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[0]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(stripe, BCH_STRIPE); + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1065,7 +1104,7 @@ struct bch_sb_field_quota { struct bch_disk_group { __u8 label[BCH_SB_LABEL_SIZE]; __le64 flags[2]; -}; +} __attribute__((packed, aligned(8))); LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -1074,7 +1113,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct 
bch_sb_field field; struct bch_disk_group entries[0]; -}; +} __attribute__((packed, aligned(8))); /* * On clean shutdown, store btree roots and current journal sequence number in @@ -1242,12 +1281,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ + BCH_FEATURE_EC = 4, BCH_FEATURE_NR, }; @@ -1417,7 +1459,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); DEF_BTREE_ID(DIRENTS, 2, "dirents") \ DEF_BTREE_ID(XATTRS, 3, "xattrs") \ DEF_BTREE_ID(ALLOC, 4, "alloc") \ - DEF_BTREE_ID(QUOTAS, 5, "quotas") + DEF_BTREE_ID(QUOTAS, 5, "quotas") \ + DEF_BTREE_ID(EC, 6, "erasure_coding") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 9a0286d86784..9679631a7e89 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -588,6 +588,8 @@ BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); +BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE); + /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 7335fbbb3f61..81c66950668c 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -5,6 +5,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -18,6 +19,7 @@ const struct bkey_ops bch2_bkey_ops[] = { [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, + [BKEY_TYPE_EC] = bch2_bkey_ec_ops, [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, }; diff --git a/fs/bcachefs/btree_gc.c 
b/fs/bcachefs/btree_gc.c index 92b82eaee69d..e900fd4ffd06 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -15,6 +15,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -116,6 +117,7 @@ static bool bkey_type_needs_gc(enum bkey_type type) switch (type) { case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_EC: return true; default: return false; @@ -156,6 +158,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + ptr_gen_recalc_oldest(c, ptr, &max_stale); + } + } default: break; } @@ -217,6 +230,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) { + ret = ptr_gen_check(c, type, ptr); + if (ret) + return ret; + } + } + } + break; default: break; } @@ -362,15 +390,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return 0; } +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, bool initial) { + enum btree_id ids[BTREE_ID_NR]; unsigned i; + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + for (i = 0; i < BTREE_ID_NR; i++) { - enum bkey_type type = bkey_type(0, i); + enum btree_id id = ids[i]; + enum bkey_type type = bkey_type(0, id); - int ret = bch2_gc_btree(c, i, initial); + int ret = bch2_gc_btree(c, id, initial); if (ret) return ret; @@ -602,6 +642,7 @@ static void 
bch2_gc_start(struct bch_fs *c) new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; + new.stripe = 0; })); ca->oldest_gens[b] = new.gen; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 86b80e32e310..47a590015325 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -55,11 +55,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) return 0; } +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; + DEFINE_BCH_BTREE_IDS() +#undef DEF_BTREE_ID + default: + BUG(); + } +} + static inline struct gc_pos gc_pos_btree(enum btree_id id, struct bpos pos, unsigned level) { return (struct gc_pos) { - .phase = GC_PHASE_BTREE_EXTENTS + id, + .phase = btree_id_to_gc_phase(id), .pos = pos, .level = level, }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 92bacd16fdc3..01e476d72595 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -340,7 +340,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ea28788b26dd..9558129e77ba 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -69,6 +69,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "ec.h" #include "error.h" #include "movinggc.h" #include "trace.h" @@ -270,6 +271,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.data += 
stats.replicas[i].ec_data; sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; sum.reserved += stats.replicas[i].persistent_reserved; } @@ -400,6 +402,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_ec += + (int) new.stripe - (int) old.stripe; dev_usage->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -639,6 +643,49 @@ static void bch2_mark_pointer(struct bch_fs *c, bucket_became_unavailable(c, old, new)); } +static void bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + s64 sectors, unsigned flags, + s64 *adjusted_disk_sectors, + unsigned *redundancy) +{ + struct ec_stripe *m; + unsigned old, new, nr_data; + int blocks_nonempty_delta; + s64 parity_sectors; + + m = genradix_ptr(&c->ec_stripes, p.idx); + if (WARN_ON(!m)) + return; + + if (WARN_ON(!m->alive)) + return; + + nr_data = m->nr_blocks - m->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + + *adjusted_disk_sectors += parity_sectors; + + *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1); + + new = atomic_add_return(sectors, &m->block_sectors[p.block]); + old = new - sectors; + + blocks_nonempty_delta = (int) !!new - (int) !!old; + if (!blocks_nonempty_delta) + return; + + atomic_add(blocks_nonempty_delta, &m->blocks_nonempty); + + BUG_ON(atomic_read(&m->blocks_nonempty) < 0); + + bch2_stripes_heap_update(c, m, p.idx); +} + static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, struct gc_pos pos, @@ -655,28 +702,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; s64 cached_sectors = 0; s64 dirty_sectors = 0; + s64 ec_sectors = 0; unsigned replicas = 0; + unsigned ec_redundancy = 0; + unsigned i; 
extent_for_each_ptr_decode(e, p, entry) { s64 disk_sectors = ptr_disk_sectors(e, p, sectors); + s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, stats, journal_seq, flags); if (!p.ptr.cached) + for (i = 0; i < p.ec_nr; i++) + bch2_mark_stripe_ptr(c, p.ec[i], + disk_sectors, flags, + &adjusted_disk_sectors, + &ec_redundancy); + if (!p.ptr.cached) replicas++; if (p.ptr.cached) - cached_sectors += disk_sectors; + cached_sectors += adjusted_disk_sectors; + else if (!p.ec_nr) + dirty_sectors += adjusted_disk_sectors; else - dirty_sectors += disk_sectors; + ec_sectors += adjusted_disk_sectors; } replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(stats->replicas)); + ec_redundancy = clamp_t(unsigned, ec_redundancy, + 1, ARRAY_SIZE(stats->replicas)); stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; stats->replicas[replicas - 1].data[data_type] += dirty_sectors; + stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; break; } case BCH_RESERVATION: { @@ -692,6 +754,78 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } +static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + bool enabled, + struct bch_fs_usage *fs_usage, + u64 journal_seq) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g; + struct bucket_mark new, old; + + BUG_ON(ptr_stale(ca, ptr)); + + rcu_read_lock(); + g = PTR_BUCKET(ca, ptr); + + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + rcu_read_unlock(); + + BUG_ON(old.stripe == enabled); + + bch2_dev_usage_update(c, ca, fs_usage, old, new); + } +} + +static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + bool inserting, struct gc_pos pos, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags) +{ 
+ switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + unsigned i; + + BUG_ON(!m); + BUG_ON(m->alive == inserting); + + BUG_ON(atomic_read(&m->blocks_nonempty)); + + for (i = 0; i < EC_STRIPE_MAX; i++) + BUG_ON(atomic_read(&m->block_sectors[i])); + + if (inserting) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; + m->nr_redundant = s.v->nr_redundant; + } + + if (inserting) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_del(c, m, idx); + + bucket_set_stripe(c, s.v, inserting, fs_usage, 0); + break; + } + } +} + void bch2_mark_key(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k, bool inserting, s64 sectors, @@ -747,6 +881,10 @@ void bch2_mark_key(struct bch_fs *c, bch2_mark_extent(c, k, sectors, BCH_DATA_USER, pos, stats, journal_seq, flags); break; + case BKEY_TYPE_EC: + bch2_mark_stripe(c, k, inserting, + pos, stats, journal_seq, flags); + break; default: break; } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 8fe6871ad165..b48960fa5ce7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -185,6 +185,7 @@ static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && !mark.dirty_sectors && + !mark.stripe && !mark.nouse); } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 49f3ab9009ea..9ec96dbab0e8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -19,7 +19,8 @@ struct bucket_mark { gen_valid:1, owned_by_allocator:1, nouse:1, - journal_seq_valid:1; + journal_seq_valid:1, + stripe:1; u16 dirty_sectors; u16 cached_sectors; @@ -53,6 +54,7 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; /* _compressed_ sectors: */ @@ -67,6 +69,7 @@ struct 
bch_fs_usage { struct { u64 data[BCH_DATA_NR]; + u64 ec_data; u64 persistent_reserved; } replicas[BCH_REPLICAS_MAX]; diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index ceb75f86b615..c8e0c37a5e1a 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -55,6 +55,19 @@ static inline struct target target_decode(unsigned target) } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 index 000000000000..f6314aa6a0f1 --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1265 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "keylist.h" +#include "super-io.h" +#include "util.h" + +#include <linux/raid/pq.h> +#include <linux/raid/xor.h> +#include <linux/sort.h> + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +static unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + unsigned bytes = sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + bch_crc_bytes[s->csum_type] * s->nr_blocks * 
stripe_csums_per_device(s); + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + void *csums = s->ptrs + s->nr_blocks; + + BUG_ON(!csum_bytes); + + return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.inode) + return "invalid stripe key"; + + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*s)) + return "incorrect value size"; + + if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i; + + pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + s->nr_blocks - s->nr_redundant, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) + pr_buf(out, " %u:%llu", s->ptrs[i].dev, + (u64) s->ptrs[i].offset); + } + } +} + +static int ptr_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + const struct bch_extent_ptr *ptr) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { + const struct bch_extent_ptr *ptr2 = v->ptrs + i; + + if (ptr->dev == ptr2->dev && + ptr->gen == ptr2->gen && + ptr->offset >= ptr2->offset && + ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) + return i; + } + + return -1; +} + +static int extent_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + int idx; + + if (!bkey_extent_is_data(k.k)) + return 
-1; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + + return -1; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + struct open_buckets *blocks, + struct open_buckets *parity, + unsigned stripe_size) +{ + struct open_bucket *ob; + unsigned i, u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = parity->nr + blocks->nr; + s->v.nr_redundant = parity->nr; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + open_bucket_for_each(c, blocks, ob, i) + s->v.ptrs[i] = ob->ptr; + + open_bucket_for_each(c, parity, ob, i) + s->v.ptrs[blocks->nr + i] = ob->ptr; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +/* Checksumming: */ + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csums_per_device = stripe_csums_per_device(v); + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i, j; + + if (!csum_bytes) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) { + for (j = 0; j < csums_per_device; j++) { + unsigned offset = j << v->csum_granularity_bits; + unsigned len = min(csum_granularity, buf->size - offset); + + struct bch_csum csum = + bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + (offset << 9), + len << 9); + + memcpy(stripe_csum(v, i, j), &csum, csum_bytes); + } + } +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; 
+ unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i; + + if (!csum_bytes) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum csum; + + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + ((offset + len) & (csum_granularity - 1))); + + csum = bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + ((offset - buf->offset) << 9), + len << 9); + + if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { + __bcache_io_error(c, + "checksum error while doing reconstruct read (%u:%u)", + i, j); + clear_bit(i, buf->valid); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void raid5_recov(unsigned disks, unsigned bytes, + unsigned failed, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed >= disks); + + swap(data[0], data[failed]); + memcpy(data[0], data[1], bytes); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, bytes, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed]); +} + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + switch (v->nr_redundant) { + case 2: + raid6_call.gen_syndrome(v->nr_blocks, bytes, buf->data); + fallthrough; + case 1: + raid5_recov(v->nr_blocks, bytes, nr_data, buf->data); + break; + default: + BUG(); + } +} + +static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) +{ + return nr - bitmap_weight(buf->valid, nr); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + return 
__ec_nr_failed(buf, buf->key.v.nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + switch (nr_failed) { + case 0: + break; + case 1: + if (test_bit(nr_data, buf->valid)) + raid5_recov(nr_data + 1, bytes, failed[0], buf->data); + else + raid6_datap_recov(v->nr_blocks, bytes, failed[0], buf->data); + break; + case 2: + /* data+data failure. */ + raid6_2data_recov(v->nr_blocks, bytes, failed[0], failed[1], buf->data); + break; + + default: + BUG(); + } + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, buf->valid)) { + ec_generate_ec(buf); + break; + } + + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + unsigned rw, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, 
+ nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, + rw, + GFP_KERNEL, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_iter.bi_size = b; + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ + struct btree_iter iter; + struct ec_stripe_buf *buf; + struct closure cl; + struct bkey_s_c k; + struct bch_stripe *v; + unsigned stripe_idx; + unsigned offset, end; + unsigned i, nr_data, csum_granularity; + int ret = 0, idx; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.idx || + rbio->pick.idx - 1 >= rbio->pick.ec_nr); + + stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + + buf = kzalloc(sizeof(*buf), GFP_NOIO); + if (!buf) + return -ENOMEM; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, stripe_idx), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) { + __bcache_io_error(c, + "error doing reconstruct read: stripe not found"); + kfree(buf); + return bch2_btree_iter_unlock(&iter) ?: -EIO; + } + + bkey_reassemble(&buf->key.k_i, k); + bch2_btree_iter_unlock(&iter); + + v = &buf->key.v; + + nr_data = v->nr_blocks - v->nr_redundant; + + idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); + BUG_ON(idx < 0); + + csum_granularity = 1U << v->csum_granularity_bits; + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; + end = offset + bio_sectors(&rbio->bio); + + BUG_ON(end > le16_to_cpu(v->sectors)); + + buf->offset = round_down(offset, csum_granularity); + 
buf->size = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)) - buf->offset; + + for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); + if (!buf->data[i]) { + ret = -ENOMEM; + goto err; + } + } + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < v->nr_blocks; i++) { + struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr)) { + __bcache_io_error(c, + "error doing reconstruct read: stale pointer"); + clear_bit(i, buf->valid); + continue; + } + + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + } + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[idx] + ((offset - buf->offset) << 9)); +err: + for (i = 0; i < v->nr_blocks; i++) + kfree(buf->data[i]); + kfree(buf); + return ret; +} + +/* ec_stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return -ENOMEM; + + spin_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + spin_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp)) + return -ENOMEM; + + return 0; +} + +static int ec_stripe_mem_alloc(struct bch_fs *c, + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; + + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) + return 0; + + bch2_btree_iter_unlock(iter); + + if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + 
return -EINTR; + return -ENOMEM; +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + return h->used && h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1; /* -1 when the heap is empty or its top entry still has non-empty blocks; data[0] is only read when h->used != 0 */ +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +static inline unsigned stripe_entry_blocks(struct ec_stripe *m) +{ + return atomic_read(&m->pin) + ?
UINT_MAX : atomic_read(&m->blocks_nonempty); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + bool queue_delete; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + + if (!m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); + return; + } + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = + stripe_entry_blocks(m); + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + queue_delete = stripe_idx_to_delete(c) >= 0; + spin_unlock(&c->ec_stripes_heap_lock); + + if (queue_delete) + schedule_work(&c->ec_stripe_delete_work); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + m->alive = false; + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + spin_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + + BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = stripe_entry_blocks(m), + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + m->alive = true; + + heap_verify_backpointer(c, idx); + + spin_unlock(&c->ec_stripes_heap_lock); +} + +static void ec_stripe_delete(struct bch_fs *c, unsigned idx) +{ + struct btree_iter iter; + struct bch_stripe *v = NULL; + struct bkey_s_c k; + struct bkey_i delete; + u64 journal_seq = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, idx), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) + goto 
out; + + v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL); + BUG_ON(!v); + memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k)); + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + bch2_btree_insert_at(c, NULL, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOUNLOCK, + BTREE_INSERT_ENTRY(&iter, &delete)); +out: + bch2_btree_iter_unlock(&iter); + kfree(v); +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + ssize_t idx; + + down_read(&c->gc_lock); + + while (1) { + spin_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + spin_unlock(&c->ec_stripes_heap_lock); + + if (idx < 0) + break; + + ec_stripe_delete(c, idx); + } + + up_read(&c->gc_lock); +} + +static int ec_stripe_bkey_insert(struct bch_fs *c, + struct bkey_i_stripe *stripe) +{ + struct ec_stripe *m; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* XXX: start pos hint */ +retry: + for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + bch2_btree_iter_unlock(&iter); + return -ENOSPC; + } + + if (bkey_deleted(k.k)) + goto found_slot; + } + + return bch2_btree_iter_unlock(&iter) ?: -ENOSPC; +found_slot: + mutex_lock(&c->ec_stripes_lock); + ret = ec_stripe_mem_alloc(c, &iter); + mutex_unlock(&c->ec_stripes_lock); + + if (ret == -EINTR) + goto retry; + if (ret) + return ret; + + m = genradix_ptr(&c->ec_stripes, iter.pos.offset); + atomic_inc(&m->pin); + + stripe->k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &stripe->k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + atomic_dec(&m->pin); + + return ret; +} + +/* stripe creation: */ + +static void extent_stripe_ptr_add(struct bkey_s_extent e, + struct ec_stripe_buf *s, + struct bch_extent_ptr *ptr, + unsigned block) +{ 
+ struct bch_extent_stripe_ptr *dst = (void *) ptr; + union bch_extent_entry *end = extent_entry_last(e); + + memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); + e.k->u64s += sizeof(*dst) / sizeof(u64); + + *dst = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .idx = s->key.k.p.offset, + }; +} + +static int ec_stripe_update_ptrs(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + BKEY_PADDED(k) tmp; + int ret = 0, dev, idx; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !btree_iter_err(k) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + idx = extent_matches_stripe(c, &s->key.v, k); + if (idx < 0) { + bch2_btree_iter_next(&iter); + continue; + } + + dev = s->key.v.ptrs[idx].dev; + + bkey_reassemble(&tmp.k, k); + e = bkey_i_to_s_extent(&tmp.k); + + extent_for_each_ptr(e, ptr) + if (ptr->dev != dev) + ptr->cached = true; + + ptr = (void *) bch2_extent_has_device(e.c, dev); + BUG_ON(!ptr); + + extent_stripe_ptr_add(e, s, ptr, idx); + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &tmp.k)); + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct ec_stripe *ec_stripe; + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bkey_i *k; + struct bch_stripe *v = &s->stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + struct closure cl; + int ret; + + BUG_ON(s->h->s == s); + + closure_init_stack(&cl); + + if (s->err) { + bch_err(c, "error creating stripe: error writing data buckets"); 
+ goto err; + } + + if (!percpu_ref_tryget(&c->writes)) + goto err; + + BUG_ON(bitmap_weight(s->blocks_allocated, + s->blocks.nr) != s->blocks.nr); + + ec_generate_ec(&s->stripe); + + ec_generate_checksums(&s->stripe); + + /* write p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); + + closure_sync(&cl); + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, s->stripe.valid)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } + + ret = ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; + } + + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + if (ret) + break; + } + + ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset); + + atomic_dec(&ec_stripe->pin); + bch2_stripes_heap_update(c, ec_stripe, + s->stripe.key.k.p.offset); + +err_put_writes: + percpu_ref_put(&c->writes); +err: + open_bucket_for_each(c, &s->blocks, ob, i) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } + + bch2_open_buckets_put(c, &s->parity); + + bch2_keylist_free(&s->keys, s->inline_keys); + + mutex_lock(&s->h->lock); + list_del(&s->list); + mutex_unlock(&s->h->lock); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); +} + +static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + list_add(&s->list, &h->stripes); + h->s = NULL; + + return s; +} + +static void ec_stripe_new_put(struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); + if (atomic_dec_and_test(&s->pin)) + ec_stripe_create(s); +} + +/* have a full bucket - hand it off to be erasure coded: */ +void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + if (ob->sectors_free) + s->err = -1; + 
+ ec_stripe_new_put(s); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); +} + +void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + struct bpos pos, unsigned sectors) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct ec_stripe_new *ec; + + if (!ob) + return; + + ec = ob->ec; + mutex_lock(&ec->lock); + + if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, + ARRAY_SIZE(ec->inline_keys), + BKEY_U64s)) { + BUG(); + } + + bkey_init(&ec->keys.top->k); + ec->keys.top->k.p = pos; + bch2_key_resize(&ec->keys.top->k, sectors); + bch2_keylist_push(&ec->keys); + + mutex_unlock(&ec->lock); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return (l > r) - (l < r); +} + +/* pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; + unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + unsigned i; 
+ + BUG_ON(h->parity.nr != h->redundancy); + BUG_ON(!h->blocks.nr); + BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + mutex_init(&s->lock); + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; + s->blocks = h->blocks; + s->parity = h->parity; + + memset(&h->blocks, 0, sizeof(h->blocks)); + memset(&h->parity, 0, sizeof(h->parity)); + + bch2_keylist_init(&s->keys, s->inline_keys); + + s->stripe.offset = 0; + s->stripe.size = h->blocksize; + memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + + ec_stripe_key_init(c, &s->stripe.key, + &s->blocks, &s->parity, + h->blocksize); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { + s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); + if (!s->stripe.data[i]) + goto err; + } + + h->s = s; + + return 0; +err: + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); + return -ENOMEM; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + mutex_lock(&h->lock); + INIT_LIST_HEAD(&h->stripes); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_USER, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + list_add(&h->list, &c->ec_new_stripe_list); + return h; +} + +void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = NULL; + + if (h->s && + 
bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) + s = ec_stripe_set_pending(h); + + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct ec_stripe_head *h; + + if (!redundancy) + return NULL; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy) { + mutex_lock(&h->lock); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy); +found: + mutex_unlock(&c->ec_new_stripe_lock); + return h; +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + struct ec_stripe_new *s = NULL; + + mutex_lock(&h->lock); + bch2_open_buckets_stop_dev(c, ca, + &h->blocks, + BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, + &h->parity, + BCH_DATA_USER); + + if (!h->s) + goto unlock; + + open_bucket_for_each(c, &h->s->blocks, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + open_bucket_for_each(c, &h->s->parity, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + goto unlock; +found: + h->s->err = -1; + s = ec_stripe_set_pending(h); +unlock: + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); + } + mutex_unlock(&c->ec_new_stripe_lock); +} + +int bch2_fs_ec_start(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t i, idx = 0; + int ret = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(&iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), + GFP_KERNEL)) + return 
-ENOMEM; +#if 0 + ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL); +#else + for (i = 0; i < idx; i++) + if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL)) + return -ENOMEM; +#endif + return 0; +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + + while (1) { + mutex_lock(&c->ec_new_stripe_lock); + h = list_first_entry_or_null(&c->ec_new_stripe_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_new_stripe_lock); + if (!h) + break; + + BUG_ON(h->s); + BUG_ON(!list_empty(&h->stripes)); + kfree(h); + } + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->ec_stripes); + bioset_exit(&c->ec_bioset); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); + + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 index 000000000000..bcf06529dcfc --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "keylist_types.h" + +const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ec_ops (struct bkey_ops) { \ + .key_invalid = bch2_ec_key_invalid, \ + .val_to_text = bch2_ec_key_to_text, \ +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + void *data[EC_STRIPE_MAX]; + + union { + struct bkey_i_stripe key; + u64 pad[255]; + }; +}; + +struct ec_stripe_head; + +struct ec_stripe_new { + struct bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + /* counts in flight writes, stripe is created when pin == 0 */ + atomic_t pin; + + 
int err; + + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; + struct open_buckets parity; + + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; + + struct ec_stripe_buf stripe; +}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + struct list_head stripes; + + unsigned target; + unsigned algo; + unsigned redundancy; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct open_buckets blocks; + struct open_buckets parity; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); +void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, + struct bpos, unsigned); + +void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, + unsigned, unsigned); + +void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t); + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); + +void bch2_ec_flush_new_stripes(struct bch_fs *); + +int bch2_fs_ec_start(struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 index 000000000000..00e89c3b7767 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef 
_BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include <linux/llist.h> + +#define EC_STRIPE_MAX 16 + +struct ec_stripe { + size_t heap_idx; + + u16 sectors; + u8 algorithm; + + u8 nr_blocks; + u8 nr_redundant; + + u8 alive; + atomic_t pin; + atomic_t blocks_nonempty; + atomic_t block_sectors[EC_STRIPE_MAX]; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index df0ca1fcf2e8..9bb4e10283e1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { + unsigned i, durability = 0; struct bch_dev *ca; - if (ptr->cached) + if (p.ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return 0; + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + for (i = 0; i < p.ec_nr; i++) { + /* index by the stripe pointer, not the read-path selector p.idx: */ + struct ec_stripe *s = genradix_ptr(&c->ec_stripes, p.ec[i].idx); - return ca->mi.durability; + if (WARN_ON(!s)) + continue; + + durability = max_t(unsigned, durability, s->nr_redundant); + } + + return durability; } unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) { - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned durability = 0; - extent_for_each_ptr(e, ptr) - durability += bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) + durability += bch2_extent_ptr_durability(c, p); return durability; } @@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct 
bch_fs *c, struct bkey_s_c_extent e, return false; } +static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = e.v->start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { - union bch_extent_entry *dst; - union bch_extent_entry *src; + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; EBUG_ON(ptr < &e.v->start->ptr || ptr >= &extent_entry_last(e)->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = to_entry(ptr + 1); - + src = extent_entry_next(to_entry(ptr)); if (src != extent_entry_last(e) && - extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) { - dst = to_entry(ptr); - } else { - extent_for_each_entry(e, dst) { - if (dst == to_entry(ptr)) - break; + !extent_entry_is_crc(src)) + drop_crc = false; - if (extent_entry_next(dst) == to_entry(ptr) && - extent_entry_is_crc(dst)) - break; + dst = to_entry(ptr); + while ((prev = extent_entry_prev(e, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; } + + dst = prev; } memmove_u64s_down(dst, src, @@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } break; @@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; struct bch_dev *ca; bool first = true; @@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, pr_buf(out, " "); switch (__extent_entry_type(entry)) { + case 
BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: @@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, crc.csum_type, crc.compression_type); break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? " stale" : ""); + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); break; default: pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); @@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f = &failed->devs[failed->nr++]; f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; f->nr_failed = 1; f->nr_retries = 0; } else { @@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, const struct extent_ptr_decoded p2) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = 
atomic64_read(&dev2->cur_latency[READ]); + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } - /* Pick at random, biased in favor of the faster device: */ + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; - return bch2_rand_range(l1 + l2) > l1; + return p1.idx < p2.idx; } static int extent_pick_read_device(struct bch_fs *c, @@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c, continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; - if (f && f->nr_failed >= f->nr_retries) + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (!p.idx && p.ec_nr) + p.idx++; + + if (force_reconstruct_read(c) && + p.idx >= p.ec_nr + 1) continue; if (ret && !ptr_better(c, p, *pick)) @@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) - return "has crc field"; + if (!extent_entry_is_ptr(entry)) + return "has non ptr field"; } extent_for_each_ptr(e, ptr) { @@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += e.k->size - len; break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } if (extent_entry_is_crc(entry)) @@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: crc = 
bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); if (crc.offset + e.k->size > @@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) else if (nonce != crc.offset + crc.nonce) return "incorrect nonce"; } - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } @@ -1756,6 +1827,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, { struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL); union bch_extent_entry *pos; + unsigned i; if (!bch2_crc_unpacked_cmp(crc, p->crc)) { pos = e->v.start; @@ -1773,6 +1845,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(e, pos, to_entry(&p->ptr)); + + for (i = 0; i < p->ec_nr; i++) { + p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + } } /* @@ -1827,26 +1904,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, unsigned target, unsigned nr_desired_replicas) { - struct bch_extent_ptr *ptr; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && - !bch2_dev_in_target(c, ptr->dev, target)) { - ptr->cached = true; + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; extra -= n; } } if (extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { - ptr->cached = true; + entry->ptr.cached = true; extra -= n; } } @@ -1922,7 +2000,7 
@@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, if ((extent_entry_type(en_l) != extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) + !extent_entry_is_ptr(en_l)) return BCH_MERGE_NOMERGE; lp = &en_l->ptr; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index d121ce5b3225..15865b27847d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -96,8 +96,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_extent_is_compressed(struct bkey_s_c); -unsigned bch2_extent_ptr_durability(struct bch_fs *, - const struct bch_extent_ptr *); unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, @@ -362,20 +360,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) /* Iterate over pointers, with crcs: */ -static inline struct extent_ptr_decoded -__extent_ptr_decoded_init(const struct bkey *k) -{ - return (struct extent_ptr_decoded) { - .crc = bch2_extent_crc_unpack(k, NULL), - }; -} - -#define EXTENT_ITERATE_EC (1 << 0) - #define __extent_ptr_next_decode(_e, _ptr, _entry) \ ({ \ __label__ out; \ \ + (_ptr).idx = 0; \ + (_ptr).ec_nr = 0; \ + \ extent_for_each_entry_from(_e, _entry, _entry) \ switch (extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ @@ -387,14 +378,16 @@ __extent_ptr_decoded_init(const struct bkey *k) (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ entry_to_crc(_entry)); \ break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + break; \ } \ - \ out: \ _entry < extent_entry_last(_e); \ }) #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - for ((_ptr) = __extent_ptr_decoded_init((_e).k), \ + for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \ (_entry) = (_e).v->start; \ __extent_ptr_next_decode(_e, _ptr, _entry); \ (_entry) = extent_entry_next(_entry)) diff --git 
a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 5738738d7953..a85cda0e7a6a 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,14 +20,18 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { + unsigned idx; + unsigned ec_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec[4]; }; struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; + u8 idx; u8 nr_failed; u8 nr_retries; } devs[BCH_REPLICAS_MAX]; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index fbd0a82fdeac..2fee2f2efd38 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -16,6 +16,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "io.h" @@ -319,6 +320,7 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; for (src = keys->keys; src != keys->top; src = n) { @@ -362,6 +364,10 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: + /* If a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_buckets_put(c, &op->open_buckets); return; err: @@ -442,7 +448,8 @@ static void init_append_extent(struct bch_write_op *op, static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; @@ -453,11 +460,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bio->bi_iter.bi_size = 
output_available; + bch2_bio_map(bio, buf); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: @@ -622,14 +636,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; struct bkey_i *key_to_write; + void *ec_buf; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -639,16 +657,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -751,7 +779,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -764,16 +793,20 @@ static int bch2_write_extent(struct 
bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; /* Free unneeded pages after compressing: */ - if (bounce) + if (to_wbio(dst)->bounce) while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, &c->bio_bounce_pages); @@ -782,6 +815,10 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + bch2_ec_add_backpointer(c, wp, + bkey_start_pos(&key_to_write->k), + total_input >> 9); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; dst->bi_opf = REQ_OP_WRITE; @@ -796,10 +833,10 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -811,6 +848,8 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; int ret; again: + memset(&op->failed, 0, sizeof(op->failed)); + do { /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > @@ -825,6 +864,7 @@ again: wp = bch2_alloc_sectors_start(c, op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, @@ -904,8 +944,6 @@ void bch2_write(struct closure *cl) op->start_time = local_clock(); - memset(&op->failed, 0, sizeof(op->failed)); - bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(&op->wbio.bio)->put_bio = false; @@ -1576,8 +1614,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, if (!pick_ret) goto hole; - if (pick_ret < 0) - goto no_device; + if (pick_ret < 0) { + 
__bcache_io_error(c, "no device to read from"); + goto err; + } if (pick_ret > 0) ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1704,36 +1744,51 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (!rbio->have_ioref) - goto no_device_postclone; - percpu_down_read(&c->usage_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); percpu_up_read(&c->usage_lock); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); - - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } - if (likely(!(flags & BCH_READ_IN_RETRY))) { - if (!(flags & BCH_READ_LAST_FRAGMENT)) { - bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + __bcache_io_error(c, "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (unlikely(c->opts.no_data_io)) { - bio_endio(&rbio->bio); - return 0; + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } - submit_bio(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { int ret; - submit_bio_wait(&rbio->bio); - rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); @@ -1748,22 +1803,12 @@ noclone: return ret; } -no_device_postclone: - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - bch2_rbio_free(rbio); -no_device: 
- __bcache_io_error(c, "no device to read from"); - - if (likely(!(flags & BCH_READ_IN_RETRY))) { - orig->bio.bi_status = BLK_STS_IOERR; - - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; - } else { +err: + if (flags & BCH_READ_IN_RETRY) return READ_ERR; - } + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; hole: /* @@ -1775,7 +1820,7 @@ hole: orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); - +out_read_done: if (flags & BCH_READ_LAST_FRAGMENT) bch2_rbio_done(orig); return 0; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index b1f6433cf9e9..6eea96ad03fb 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1071,7 +1071,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, replicas = bch2_extent_nr_ptrs(e.c); rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &c->rw_devs[BCH_DATA_JOURNAL]); for (i = 0; i < devs_sorted.nr; i++) { @@ -1098,8 +1098,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, sectors > ca->mi.bucket_size) continue; - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); + bch2_dev_stripe_increment(c, ca, &j->wp.stripe); ja->sectors_free = ca->mi.bucket_size - sectors; ja->cur_idx = (ja->cur_idx + 1) % ja->nr; diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 74e92a196ccd..4d86c4bc4a5f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -279,11 +279,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); break; + case Opt_erasure_code: + if (v && + !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_EC); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; } return ret; } +int bch2_opts_check_may_set(struct bch_fs *c) +{ 
+ unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } + + return 0; +} + int bch2_parse_mount_opts(struct bch_opts *opts, char *options) { char *opt, *name, *val; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 8f4fab7f7dc8..80869e34e3b6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -111,6 +111,9 @@ enum opt_type { BCH_OPT(promote_target, u16, OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0) \ + BCH_OPT(erasure_code, u16, OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false) \ BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) \ @@ -270,6 +273,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ @@ -281,7 +285,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *); BCH_INODE_OPT(data_replicas, 8) \ BCH_INODE_OPT(promote_target, 16) \ BCH_INODE_OPT(foreground_target, 16) \ - BCH_INODE_OPT(background_target, 16) + BCH_INODE_OPT(background_target, 16) \ + BCH_INODE_OPT(erasure_code, 16) struct bch_io_opts { #define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 1ae8133a1ef7..ddfba16a2998 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_io.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "fsck.h" #include "journal_io.h" @@ -213,6 +214,11 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + err = "cannot allocate memory"; + ret = bch2_fs_ec_start(c); + if (ret) + goto err; + bch_verbose(c, "starting mark and sweep:"); err = "error in 
recovery"; ret = bch2_initial_gc(c, &journal); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 83fc9c93d295..0296931b6b8c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -80,9 +80,33 @@ static void extent_to_replicas(struct bkey_s_c k, r->nr_required = 1; - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached) - r->devs[r->nr_devs++] = p.ptr.dev; + extent_for_each_ptr_decode(e, p, entry) { + if (p.ptr.cached) + continue; + + if (p.ec_nr) { + r->nr_devs = 0; + break; + } + + r->devs[r->nr_devs++] = p.ptr.dev; + } + } +} + +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + if (k.k->type == BCH_STRIPE) { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; + + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; } } @@ -101,6 +125,10 @@ static void bkey_to_replicas(enum bkey_type type, e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; + case BKEY_TYPE_EC: + e->data_type = BCH_DATA_USER; + stripe_to_replicas(k, e); + break; default: break; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 071543033096..3dbcb6d7d261 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "checksum.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "io.h" #include "journal.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a22beff7cc96..931e50e8ad57 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -20,6 +20,7 @@ #include "compress.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -364,6 +365,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); 
bch2_fs_btree_cache_exit(c); @@ -544,6 +546,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->ec_new_stripe_list); + mutex_init(&c->ec_new_stripe_lock); + mutex_init(&c->ec_stripes_lock); + spin_lock_init(&c->ec_stripes_heap_lock); + seqcount_init(&c->gc_pos_lock); c->copy_gc_enabled = 1; @@ -612,6 +619,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || + bch2_fs_ec_init(c) || bch2_fs_fsio_init(c)) goto err; @@ -683,6 +691,10 @@ const char *bch2_fs_start(struct bch_fs *c) if (ret) goto err; + ret = bch2_opts_check_may_set(c); + if (ret) + goto err; + err = "dynamic fault"; if (bch2_fs_init_fault("fs_start")) goto err; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6a5da0f12713..188e19572d91 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -19,6 +19,7 @@ #include "btree_gc.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "journal.h" #include "keylist.h" @@ -188,6 +189,8 @@ sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_work); rw_attribute(promote_whole_extents); +read_attribute(new_stripes); + rw_attribute(pd_controllers_update_seconds); read_attribute(meta_replicas_have); @@ -242,6 +245,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) pr_buf(&out, "\t%s:\t\t%llu\n", bch2_data_types[type], stats.replicas[replicas].data[type]); + pr_buf(&out, "\terasure coded:\t%llu\n", + stats.replicas[replicas].ec_data); pr_buf(&out, "\treserved:\t%llu\n", stats.replicas[replicas].persistent_reserved); } @@ -310,6 +315,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) compressed_sectors_uncompressed << 9); } +static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) +{ + char *out = buf, *end = buf + PAGE_SIZE; + struct 
ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + out += scnprintf(out, end - out, + "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + /* h->s is only changed with h->lock held, so read it under the lock too: */ + mutex_lock(&h->lock); + if (h->s) + out += scnprintf(out, end - out, + "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + + list_for_each_entry(s, &h->stripes, list) + out += scnprintf(out, end - out, + "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); + mutex_unlock(&h->lock); + } + mutex_unlock(&c->ec_new_stripe_lock); + + return out - buf; +} + SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); @@ -369,6 +409,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) return bch2_compression_stats(c, buf); + if (attr == &sysfs_new_stripes) + return bch2_new_stripes(c, buf); + #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM @@ -537,6 +580,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), + &sysfs_new_stripes, + &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -765,6 +810,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" + " erasure coded: %llu\n" " available: %lli\n" "sectors:\n" " sb: %llu\n" @@ -788,6 +834,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], + stats.buckets_ec, ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], |