author    | Kent Overstreet <kent.overstreet@gmail.com> | 2018-11-01 15:13:19 -0400
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:08:11 -0400
commit    | cd575ddf57af004913ff5a994aa5f3203216fa68 (patch)
tree      | 15fafb5d59b359aef897a2296f1ce7f8bc33f55a /fs/bcachefs/alloc_foreground.c
parent    | 91f8b5677b5d831cff34b25ef03322ae49e03256 (diff)
download  | lwn-cd575ddf57af004913ff5a994aa5f3203216fa68.tar.gz
          | lwn-cd575ddf57af004913ff5a994aa5f3203216fa68.zip
bcachefs: Erasure coding
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs/alloc_foreground.c')
-rw-r--r-- | fs/bcachefs/alloc_foreground.c | 353
1 file changed, 281 insertions, 72 deletions
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index df74e41ec890..6e5f6e57da56 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -62,6 +62,7 @@
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "io.h"
 #include "trace.h"
 
@@ -95,6 +96,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
+        if (ob->ec) {
+                bch2_ec_bucket_written(c, ob);
+                return;
+        }
+
         percpu_down_read(&c->usage_lock);
         spin_lock(&ob->lock);
 
@@ -114,6 +120,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
         closure_wake_up(&c->open_buckets_wait);
 }
 
+void bch2_open_bucket_write_error(struct bch_fs *c,
+                                  struct open_buckets *obs,
+                                  unsigned dev)
+{
+        struct open_bucket *ob;
+        unsigned i;
+
+        open_bucket_for_each(c, obs, ob, i)
+                if (ob->ptr.dev == dev &&
+                    ob->ec)
+                        bch2_ec_bucket_cancel(c, ob);
+}
+
 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 {
         struct open_bucket *ob;
@@ -129,15 +148,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-                                    struct write_point *wp,
-                                    struct open_bucket *ob)
+                                    struct open_bucket *ob,
+                                    bool may_realloc)
 {
         struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
         BUG_ON(ca->open_buckets_partial_nr >=
                ARRAY_SIZE(ca->open_buckets_partial));
 
-        if (wp->type == BCH_DATA_USER) {
+        if (ca->open_buckets_partial_nr <
+            ARRAY_SIZE(ca->open_buckets_partial) &&
+            may_realloc) {
                 spin_lock(&c->freelist_lock);
                 ob->on_partial_list = true;
                 ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
@@ -285,18 +306,18 @@ out:
         return ob;
 }
 
-static int __dev_alloc_cmp(struct write_point *wp,
-                           unsigned l, unsigned r)
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+                            unsigned l, unsigned r)
 {
-        return ((wp->next_alloc[l] > wp->next_alloc[r]) -
-                (wp->next_alloc[l] < wp->next_alloc[r]));
+        return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+                (stripe->next_alloc[l] < stripe->next_alloc[r]));
 }
 
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
-                                         struct write_point *wp,
-                                         struct bch_devs_mask *devs)
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+                                          struct dev_stripe_state *stripe,
+                                          struct bch_devs_mask *devs)
 {
         struct dev_alloc_list ret = { .nr = 0 };
         struct bch_dev *ca;
@@ -305,14 +326,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
         for_each_member_device_rcu(ca, c, i, devs)
                 ret.devs[ret.nr++] = i;
 
-        bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
+        bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
         return ret;
 }
 
-void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
-                     struct write_point *wp)
+void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+                               struct dev_stripe_state *stripe)
 {
-        u64 *v = wp->next_alloc + ca->dev_idx;
+        u64 *v = stripe->next_alloc + ca->dev_idx;
         u64 free_space = dev_buckets_free(c, ca);
         u64 free_space_inv = free_space
                 ? div64_u64(1ULL << 48, free_space)
@@ -324,26 +345,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
         else
                 *v = U64_MAX;
 
-        for (v = wp->next_alloc;
-             v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+        for (v = stripe->next_alloc;
+             v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
                 *v = *v < scale ? 0 : *v - scale;
 }
 
+#define BUCKET_MAY_ALLOC_PARTIAL        (1 << 0)
+#define BUCKET_ALLOC_USE_DURABILITY     (1 << 1)
+
 static int bch2_bucket_alloc_set(struct bch_fs *c,
                                  struct open_buckets *ptrs,
-                                 struct write_point *wp,
+                                 struct dev_stripe_state *stripe,
                                  struct bch_devs_mask *devs_may_alloc,
                                  unsigned nr_replicas,
                                  unsigned *nr_effective,
                                  bool *have_cache,
                                  enum alloc_reserve reserve,
+                                 unsigned flags,
                                  struct closure *cl)
 {
         struct dev_alloc_list devs_sorted =
-                bch2_wp_alloc_list(c, wp, devs_may_alloc);
+                bch2_dev_alloc_list(c, stripe, devs_may_alloc);
         struct bch_dev *ca;
         bool alloc_failure = false;
-        unsigned i;
+        unsigned i, durability;
 
         BUG_ON(*nr_effective >= nr_replicas);
 
@@ -354,13 +379,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
                 if (!ca)
                         continue;
 
-                if (!ca->mi.durability &&
-                    (*have_cache ||
-                     wp->type != BCH_DATA_USER))
+                if (!ca->mi.durability && *have_cache)
                         continue;
 
                 ob = bch2_bucket_alloc(c, ca, reserve,
-                                       wp->type == BCH_DATA_USER, cl);
+                                       flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
                 if (IS_ERR(ob)) {
                         enum bucket_alloc_ret ret = -PTR_ERR(ob);
 
@@ -375,13 +398,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
                         continue;
                 }
 
+                durability = (flags & BUCKET_ALLOC_USE_DURABILITY)
+                        ? ca->mi.durability : 1;
+
                 __clear_bit(ca->dev_idx, devs_may_alloc->d);
-                *nr_effective += ca->mi.durability;
-                *have_cache |= !ca->mi.durability;
+                *nr_effective += durability;
+                *have_cache |= !durability;
 
                 ob_push(c, ptrs, ob);
 
-                bch2_wp_rescale(c, ca, wp);
+                bch2_dev_stripe_increment(c, ca, stripe);
 
                 if (*nr_effective >= nr_replicas)
                         return 0;
@@ -390,15 +416,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
         return alloc_failure ? -ENOSPC : -EROFS;
 }
 
+/* Allocate from stripes: */
+
+/*
+ * XXX: use a higher watermark for allocating open buckets here:
+ */
+static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+        struct bch_devs_mask devs;
+        struct open_bucket *ob;
+        unsigned i, nr_have = 0, nr_data =
+                min_t(unsigned, h->nr_active_devs,
+                      EC_STRIPE_MAX) - h->redundancy;
+        bool have_cache = true;
+        int ret = 0;
+
+        BUG_ON(h->blocks.nr > nr_data);
+        BUG_ON(h->parity.nr > h->redundancy);
+
+        devs = h->devs;
+
+        open_bucket_for_each(c, &h->parity, ob, i)
+                __clear_bit(ob->ptr.dev, devs.d);
+        open_bucket_for_each(c, &h->blocks, ob, i)
+                __clear_bit(ob->ptr.dev, devs.d);
+
+        percpu_down_read(&c->usage_lock);
+        rcu_read_lock();
+
+        if (h->parity.nr < h->redundancy) {
+                nr_have = h->parity.nr;
+
+                ret = bch2_bucket_alloc_set(c, &h->parity,
+                                            &h->parity_stripe,
+                                            &devs,
+                                            h->redundancy,
+                                            &nr_have,
+                                            &have_cache,
+                                            RESERVE_NONE,
+                                            0,
+                                            NULL);
+                if (ret)
+                        goto err;
+        }
+
+        if (h->blocks.nr < nr_data) {
+                nr_have = h->blocks.nr;
+
+                ret = bch2_bucket_alloc_set(c, &h->blocks,
+                                            &h->block_stripe,
+                                            &devs,
+                                            nr_data,
+                                            &nr_have,
+                                            &have_cache,
+                                            RESERVE_NONE,
+                                            0,
+                                            NULL);
+                if (ret)
+                        goto err;
+        }
+
+        rcu_read_unlock();
+        percpu_up_read(&c->usage_lock);
+
+        return bch2_ec_stripe_new_alloc(c, h);
+err:
+        rcu_read_unlock();
+        percpu_up_read(&c->usage_lock);
+        return -1;
+}
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static void bucket_alloc_from_stripe(struct bch_fs *c,
+                                     struct open_buckets *ptrs,
+                                     struct write_point *wp,
+                                     struct bch_devs_mask *devs_may_alloc,
+                                     u16 target,
+                                     unsigned erasure_code,
+                                     unsigned nr_replicas,
+                                     unsigned *nr_effective,
+                                     bool *have_cache)
+{
+        struct dev_alloc_list devs_sorted;
+        struct ec_stripe_head *h;
+        struct open_bucket *ob;
+        struct bch_dev *ca;
+        unsigned i, ec_idx;
+
+        if (!erasure_code)
+                return;
+
+        if (nr_replicas < 2)
+                return;
+
+        if (ec_open_bucket(c, ptrs))
+                return;
+
+        h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+        if (!h)
+                return;
+
+        if (!h->s && ec_stripe_alloc(c, h))
+                goto out_put_head;
+
+        rcu_read_lock();
+        devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+        rcu_read_unlock();
+
+        for (i = 0; i < devs_sorted.nr; i++)
+                open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+                        if (ob->ptr.dev == devs_sorted.devs[i] &&
+                            !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+                                goto got_bucket;
+        goto out_put_head;
+got_bucket:
+        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+        ob->ec_idx = ec_idx;
+        ob->ec = h->s;
+
+        __clear_bit(ob->ptr.dev, devs_may_alloc->d);
+        *nr_effective += ca->mi.durability;
+        *have_cache |= !ca->mi.durability;
+
+        ob_push(c, ptrs, ob);
+        atomic_inc(&h->s->pin);
+out_put_head:
+        bch2_ec_stripe_head_put(h);
+}
+
 /* Sector allocator */
 
-static int get_buckets_from_writepoint(struct bch_fs *c,
-                                       struct open_buckets *ptrs,
-                                       struct write_point *wp,
-                                       struct bch_devs_mask *devs_may_alloc,
-                                       unsigned nr_replicas,
-                                       unsigned *nr_effective,
-                                       bool *have_cache)
+static void get_buckets_from_writepoint(struct bch_fs *c,
+                                        struct open_buckets *ptrs,
+                                        struct write_point *wp,
+                                        struct bch_devs_mask *devs_may_alloc,
+                                        unsigned nr_replicas,
+                                        unsigned *nr_effective,
+                                        bool *have_cache,
+                                        bool need_ec)
 {
         struct open_buckets ptrs_skip = { .nr = 0 };
         struct open_bucket *ob;
@@ -410,7 +571,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
                 if (*nr_effective < nr_replicas &&
                     test_bit(ob->ptr.dev, devs_may_alloc->d) &&
                     (ca->mi.durability ||
-                     (wp->type == BCH_DATA_USER && !*have_cache))) {
+                     (wp->type == BCH_DATA_USER && !*have_cache)) &&
+                    (ob->ec || !need_ec)) {
                         __clear_bit(ob->ptr.dev, devs_may_alloc->d);
                         *nr_effective += ca->mi.durability;
                         *have_cache |= !ca->mi.durability;
@@ -421,8 +583,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
                 }
         }
         wp->ptrs = ptrs_skip;
-
-        return *nr_effective < nr_replicas ? -ENOSPC : 0;
 }
 
 static int open_bucket_add_buckets(struct bch_fs *c,
@@ -430,22 +590,25 @@ static int open_bucket_add_buckets(struct bch_fs *c,
                                    struct write_point *wp,
                                    struct bch_devs_list *devs_have,
                                    u16 target,
+                                   unsigned erasure_code,
                                    unsigned nr_replicas,
                                    unsigned *nr_effective,
                                    bool *have_cache,
                                    enum alloc_reserve reserve,
-                                   struct closure *cl)
+                                   struct closure *_cl)
 {
         struct bch_devs_mask devs;
-        const struct bch_devs_mask *t;
         struct open_bucket *ob;
-        unsigned i;
+        struct closure *cl = NULL;
+        unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY;
         int ret;
 
-        percpu_down_read(&c->usage_lock);
-        rcu_read_lock();
+        if (wp->type == BCH_DATA_USER)
+                flags |= BUCKET_MAY_ALLOC_PARTIAL;
 
-        devs = c->rw_devs[wp->type];
+        rcu_read_lock();
+        devs = target_rw_devs(c, wp->type, target);
+        rcu_read_unlock();
 
         /* Don't allocate from devices we already have pointers to: */
         for (i = 0; i < devs_have->nr; i++)
@@ -454,50 +617,83 @@ static int open_bucket_add_buckets(struct bch_fs *c,
         open_bucket_for_each(c, ptrs, ob, i)
                 __clear_bit(ob->ptr.dev, devs.d);
 
-        t = bch2_target_to_mask(c, target);
-        if (t)
-                bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+        if (erasure_code) {
+                get_buckets_from_writepoint(c, ptrs, wp, &devs,
+                                            nr_replicas, nr_effective,
+                                            have_cache, true);
+                if (*nr_effective >= nr_replicas)
+                        return 0;
 
-        ret = get_buckets_from_writepoint(c, ptrs, wp, &devs,
-                                          nr_replicas, nr_effective, have_cache);
-        if (!ret)
-                goto out;
+                bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+                                         target, erasure_code,
+                                         nr_replicas, nr_effective,
+                                         have_cache);
+                if (*nr_effective >= nr_replicas)
+                        return 0;
+        }
+
+        get_buckets_from_writepoint(c, ptrs, wp, &devs,
+                                    nr_replicas, nr_effective,
+                                    have_cache, false);
+        if (*nr_effective >= nr_replicas)
+                return 0;
+
+        percpu_down_read(&c->usage_lock);
+        rcu_read_lock();
+retry_blocking:
         /*
          * Try nonblocking first, so that if one device is full we'll try from
          * other devices:
          */
-        ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
+        ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
                                     nr_replicas, nr_effective, have_cache,
-                                    reserve, NULL);
-        if (!ret || ret == -EROFS || !cl)
-                goto out;
+                                    reserve, flags, cl);
+        if (ret && ret != -EROFS && !cl && _cl) {
+                cl = _cl;
+                goto retry_blocking;
+        }
 
-        ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
-                                    nr_replicas, nr_effective, have_cache,
-                                    reserve, cl);
-out:
         rcu_read_unlock();
         percpu_up_read(&c->usage_lock);
 
         return ret;
 }
 
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-                          struct write_point *wp)
+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
                                struct open_buckets *obs,
+                                enum bch_data_type data_type)
 {
         struct open_buckets ptrs = { .nr = 0 };
-        struct open_bucket *ob;
-        unsigned i;
+        struct open_bucket *ob, *ob2;
+        unsigned i, j;
 
-        mutex_lock(&wp->lock);
-        open_bucket_for_each(c, &wp->ptrs, ob, i)
-                if (!ca || ob->ptr.dev == ca->dev_idx)
-                        open_bucket_free_unused(c, wp, ob);
+        open_bucket_for_each(c, obs, ob, i) {
+                bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+
+                if (!drop && ob->ec) {
+                        mutex_lock(&ob->ec->lock);
+                        open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
+                                drop |= ob2->ptr.dev == ca->dev_idx;
+                        open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+                                drop |= ob2->ptr.dev == ca->dev_idx;
+                        mutex_unlock(&ob->ec->lock);
+                }
+
+                if (drop)
+                        bch2_open_bucket_put(c, ob);
                 else
                         ob_push(c, &ptrs, ob);
+        }
 
-        wp->ptrs = ptrs;
+        *obs = ptrs;
+}
+
+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+                          struct write_point *wp)
+{
+        mutex_lock(&wp->lock);
+        bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
         mutex_unlock(&wp->lock);
 }
 
@@ -630,6 +826,7 @@ out:
  */
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
                                 unsigned target,
+                                unsigned erasure_code,
                                 struct write_point_specifier write_point,
                                 struct bch_devs_list *devs_have,
                                 unsigned nr_replicas,
@@ -649,26 +846,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
         BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
         write_points_nr = c->write_points_nr;
+
         wp = writepoint_find(c, write_point.v);
 
+        /* metadata may not allocate on cache devices: */
+        if (wp->type != BCH_DATA_USER)
+                have_cache = true;
+
         if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                              target, erasure_code,
                                               nr_replicas, &nr_effective,
                                               &have_cache, reserve, cl);
         } else {
-                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                              target, erasure_code,
                                               nr_replicas, &nr_effective,
                                               &have_cache, reserve, NULL);
                 if (!ret)
                         goto alloc_done;
 
-                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0,
+                ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                              0, erasure_code,
                                               nr_replicas, &nr_effective,
                                               &have_cache, reserve, cl);
         }
 alloc_done:
         BUG_ON(!ret && nr_effective < nr_replicas);
 
+        if (erasure_code && !ec_open_bucket(c, &ptrs))
+                pr_debug("failed to get ec bucket: ret %u", ret);
+
         if (ret == -EROFS &&
             nr_effective >= nr_replicas_required)
                 ret = 0;
@@ -678,7 +886,7 @@ alloc_done:
 
         /* Free buckets we didn't use: */
         open_bucket_for_each(c, &wp->ptrs, ob, i)
-                open_bucket_free_unused(c, wp, ob);
+                open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
 
         wp->ptrs = ptrs;
 
@@ -697,7 +905,8 @@ err:
                 if (ptrs.nr < ARRAY_SIZE(ptrs.v))
                         ob_push(c, &ptrs, ob);
                 else
-                        open_bucket_free_unused(c, wp, ob);
+                        open_bucket_free_unused(c, ob,
+                                        wp->type == BCH_DATA_USER);
         wp->ptrs = ptrs;
 
         mutex_unlock(&wp->lock);
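For readers tracing the new allocation policy above: the commit moves the per-write-point next_alloc weights into a shared struct dev_stripe_state, where a device's weight grows by roughly 2^48 / free_space each time it is allocated from (bch2_dev_stripe_increment), and bch2_dev_alloc_list then tries devices in ascending weight order. The standalone C model below is a minimal sketch of that weighting only; it is not bcachefs code, and the fixed NR_DEVS and the omission of the `scale` rebasing pass (whose definition lies outside the changed hunks) are assumptions made to keep it self-contained and runnable.

/*
 * Standalone model (not bcachefs code) of the dev_stripe_state weighting
 * seen in the diff: after each allocation a device's next_alloc weight
 * grows by ~2^48 / free_space, and the next allocation prefers the device
 * with the smallest weight, so emptier devices are chosen more often.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_DEVS 4

struct dev_stripe_state {
        uint64_t next_alloc[NR_DEVS];   /* accumulated weight per device */
};

/* bump a device's weight after allocating from it */
static void dev_stripe_increment(struct dev_stripe_state *s,
                                 unsigned dev, uint64_t free_buckets)
{
        uint64_t inv = free_buckets ? (1ULL << 48) / free_buckets : UINT64_MAX;
        uint64_t *v = &s->next_alloc[dev];

        if (*v + inv >= *v)             /* guard against wraparound */
                *v += inv;
        else
                *v = UINT64_MAX;
}

/* pick the device with the smallest accumulated weight */
static unsigned dev_stripe_pick(const struct dev_stripe_state *s)
{
        unsigned best = 0;

        for (unsigned i = 1; i < NR_DEVS; i++)
                if (s->next_alloc[i] < s->next_alloc[best])
                        best = i;
        return best;
}

int main(void)
{
        /* devices with different amounts of free space, in buckets */
        uint64_t free_buckets[NR_DEVS] = { 1000, 500, 250, 125 };
        struct dev_stripe_state s = { { 0 } };

        for (int n = 0; n < 16; n++) {
                unsigned dev = dev_stripe_pick(&s);

                printf("allocation %2d -> dev %u\n", n, dev);
                dev_stripe_increment(&s, dev, free_buckets[dev]);
        }
        return 0;
}

Running the model shows allocations spreading across the four devices roughly in proportion to their free space, which is the behaviour the stripe state is meant to preserve as devices fill.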