| author | Kent Overstreet <kent.overstreet@gmail.com> | 2018-07-22 10:43:01 -0400 |
|---|---|---|
| committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:08:08 -0400 |
| commit | b29e197aafd95fc5cd50f0fd85c6275e3aa319a6 (patch) | |
| tree | d7111fa0f5c0515f4533acd4fc167fdcf445f63c | |
| parent | b2be7c8b731262c5342e9f068b490d61e540ad0d (diff) | |
| download | lwn-b29e197aafd95fc5cd50f0fd85c6275e3aa319a6.tar.gz, lwn-b29e197aafd95fc5cd50f0fd85c6275e3aa319a6.zip | |
bcachefs: Invalidate buckets when writing to alloc btree
Prep work for persistent alloc information. Refactoring also lets us
make free_inc much smaller, which means a lot fewer buckets stranded on
freelists.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
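The diff below restructures the allocator thread: buckets now have their alloc keys written (and are invalidated) on the way from the scan heap into free_inc, instead of being collected into a large free_inc, sorted, and rewritten in a separate pass. A rough userspace sketch of that control flow — toy globals and stub helpers only, not the bcachefs API — could look like this:

```c
/*
 * Toy model of the reworked allocator loop in bch2_allocator_thread()
 * (see the alloc.c hunks below): discard what was already invalidated,
 * invalidate buckets off the scan heap into free_inc, and only rescan
 * once free_inc has been drained.  Types and helpers are simplified
 * stand-ins, not bcachefs APIs.
 */
#include <stddef.h>
#include <stdio.h>

#define SCAN_BATCH   8          /* stand-in for ALLOC_SCAN_BATCH(ca) */
#define FREE_INC_NR  4          /* the now much smaller free_inc */

static size_t heap[SCAN_BATCH], heap_nr;        /* buckets found by scanning */
static size_t free_inc[FREE_INC_NR], free_inc_nr;
static size_t cursor = 100, passes;

static size_t scan(void)                        /* find_reclaimable_buckets() */
{
	while (heap_nr < SCAN_BATCH)
		heap[heap_nr++] = cursor++;
	return heap_nr;
}

static void invalidate(void)                    /* bch2_invalidate_buckets() */
{
	while (free_inc_nr < FREE_INC_NR && heap_nr)
		free_inc[free_inc_nr++] = heap[--heap_nr];
}

static void discard(void)                       /* discard_invalidated_buckets() */
{
	while (free_inc_nr)
		printf("discard + free bucket %zu\n", free_inc[--free_inc_nr]);
}

int main(void)
{
	while (passes++ < 4) {
		discard();
		invalidate();
		if (free_inc_nr)        /* still work queued: go discard it */
			continue;
		if (!scan())            /* nothing reclaimable: a real thread waits */
			break;
	}
	return 0;
}
```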
-rw-r--r-- | fs/bcachefs/alloc.c | 610
-rw-r--r-- | fs/bcachefs/alloc.h | 2
-rw-r--r-- | fs/bcachefs/bcachefs.h | 7
-rw-r--r-- | fs/bcachefs/btree_update_interior.c | 3
-rw-r--r-- | fs/bcachefs/buckets.c | 13
-rw-r--r-- | fs/bcachefs/buckets.h | 2
6 files changed, 300 insertions, 337 deletions
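One piece of logic worth a closer look before the diff: bch2_invalidate_one_bucket() rebuilds a full 64-bit journal sequence number from the 16 bits kept in the bucket mark, stepping back one 16-bit epoch if the spliced value would land ahead of the current sequence. The standalone demo below shows the same arithmetic as the alloc.c hunk; the helper name widen_bucket_seq is made up for illustration, this is plain C rather than kernel code:

```c
/*
 * Demo of the journal-sequence widening done in bch2_invalidate_one_bucket():
 * the bucket mark only stores the low 16 bits of the journal sequence, so the
 * full value is rebuilt from the current sequence and clamped so it never
 * lands in the future.  Standalone sketch, not kernel code.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t widen_bucket_seq(uint64_t journal_seq, uint16_t bucket_seq16)
{
	uint64_t bucket_seq = journal_seq;

	bucket_seq &= ~(uint64_t) UINT16_MAX;   /* keep the high bits ...   */
	bucket_seq |= bucket_seq16;             /* ... splice in the low 16 */

	/* a "future" result means the low bits wrapped: step back one epoch */
	if (bucket_seq > journal_seq)
		bucket_seq -= 1 << 16;

	return bucket_seq;
}

int main(void)
{
	/* current journal seq 0x10005; buckets last dirtied at low bits 0xfff0, 0x0003 */
	uint64_t cur = 0x10005;

	printf("0x%" PRIx64 "\n", widen_bucket_seq(cur, 0xfff0)); /* 0xfff0  */
	printf("0x%" PRIx64 "\n", widen_bucket_seq(cur, 0x0003)); /* 0x10003 */
	return 0;
}
```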
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index e6e506e4a8a3..19523226afd8 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -288,53 +288,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct btree_iter *iter, - u64 *journal_seq, bool nowait) + u64 *journal_seq, unsigned flags) { struct bucket_mark m; __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; struct bucket *g; struct bkey_i_alloc *a; u8 *d; - int ret; - unsigned flags = BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE; - - if (nowait) - flags |= BTREE_INSERT_NOWAIT; - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); + percpu_down_read(&c->usage_lock); + g = bucket(ca, b); + + m = READ_ONCE(g->mark); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = POS(ca->dev_idx, b); + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, g->io_time[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->io_time[WRITE]); + percpu_up_read(&c->usage_lock); - do { - ret = btree_iter_err(bch2_btree_iter_peek_slot(iter)); - if (ret) - break; + bch2_btree_iter_cond_resched(iter); - percpu_down_read(&c->usage_lock); - g = bucket(ca, b); - - /* read mark under btree node lock: */ - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - a->v.fields = 0; - a->v.gen = m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); - - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read(&c->usage_lock); - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags, - BTREE_INSERT_ENTRY(iter, &a->k_i)); - bch2_btree_iter_cond_resched(iter); - } while (ret == -EINTR); + bch2_btree_iter_set_pos(iter, a->k.p); - return ret; + return bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); } int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) @@ -354,8 +342,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); bch2_btree_iter_unlock(&iter); return ret; } @@ -375,8 +362,8 @@ int bch2_alloc_write(struct bch_fs *c) down_read(&ca->bucket_lock); for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, &iter, - NULL, false); + ret = __bch2_alloc_write_key(c, ca, bucket, + &iter, NULL, 0); if (ret) break; @@ -582,47 +569,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, return gc_gen < BUCKET_GC_GEN_MAX; } -static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket) -{ - struct bucket_mark m; - - percpu_down_read(&c->usage_lock); - spin_lock(&c->freelist_lock); - - if (!bch2_invalidate_bucket(c, ca, bucket, &m)) { - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->usage_lock); - return; - } - - 
verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->usage_lock); - - /* gc lock held: */ - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - if (m.cached_sectors) { - ca->allocator_invalidating_data = true; - } else if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - ca->allocator_journal_seq_flush = - max(ca->allocator_journal_seq_flush, bucket_seq); - } -} - /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. @@ -674,11 +620,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h, (l.bucket > r.bucket) - (l.bucket < r.bucket); } +static inline int bucket_idx_cmp(const void *_l, const void *_r) +{ + const struct alloc_heap_entry *l = _l, *r = _r; + + return (l->bucket > r->bucket) - (l->bucket < r->bucket); +} + static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; - size_t b; + size_t b, i, nr = 0; ca->alloc_heap.used = 0; @@ -720,55 +673,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) if (e.nr) heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); - up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp); - - while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) { - for (b = e.bucket; - b < e.bucket + e.nr; - b++) { - if (fifo_full(&ca->free_inc)) - return; + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; - bch2_invalidate_one_bucket(c, ca, b); - } + while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { + nr -= ca->alloc_heap.data[0].nr; + heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp); } + + up_read(&ca->bucket_lock); + mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t b, checked; + size_t b, start; - for (checked = 0; - checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc); - checked++) { - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; + if (ca->fifo_last_bucket < ca->mi.first_bucket || + ca->fifo_last_bucket >= ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + + start = ca->fifo_last_bucket; - b = ca->fifo_last_bucket++; + do { + ca->fifo_last_bucket++; + if (ca->fifo_last_bucket == ca->mi.nbuckets) + ca->fifo_last_bucket = ca->mi.first_bucket; + b = ca->fifo_last_bucket; m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); - } + } while (ca->fifo_last_bucket != start); } static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets = bucket_array(ca); struct bucket_mark m; - size_t checked; + size_t checked, i; for (checked = 0; - checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc); + checked < 
ca->mi.nbuckets / 2; checked++) { size_t b = bch2_rand_range(ca->mi.nbuckets - ca->mi.first_bucket) + @@ -776,17 +732,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca m = READ_ONCE(buckets->b[b].mark); - if (bch2_can_invalidate_bucket(ca, b, m)) - bch2_invalidate_one_bucket(c, ca, b); + if (bch2_can_invalidate_bucket(ca, b, m)) { + struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; + + heap_add(&ca->alloc_heap, e, bucket_alloc_cmp); + if (heap_full(&ca->alloc_heap)) + break; + } cond_resched(); } + + sort(ca->alloc_heap.data, + ca->alloc_heap.used, + sizeof(ca->alloc_heap.data[0]), + bucket_idx_cmp, NULL); + + /* remove duplicates: */ + for (i = 0; i + 1 < ca->alloc_heap.used; i++) + if (ca->alloc_heap.data[i].bucket == + ca->alloc_heap.data[i + 1].bucket) + ca->alloc_heap.data[i].nr = 0; } -static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) +static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { + size_t i, nr = 0; + ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: @@ -799,86 +772,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) find_reclaimable_buckets_random(c, ca); break; } + + heap_resort(&ca->alloc_heap, bucket_alloc_cmp); + + for (i = 0; i < ca->alloc_heap.used; i++) + nr += ca->alloc_heap.data[i].nr; + + return nr; } -static int size_t_cmp(const void *_l, const void *_r) +static inline long next_alloc_bucket(struct bch_dev *ca) { - const size_t *l = _l, *r = _r; + struct alloc_heap_entry e, *top = ca->alloc_heap.data; + + while (ca->alloc_heap.used) { + if (top->nr) { + size_t b = top->bucket; + + top->bucket++; + top->nr--; + return b; + } - return (*l > *r) - (*l < *r); + heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp); + } + + return -1; } -static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca) +static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t bucket, u64 *flush_seq) { - BUG_ON(ca->free_inc.front); + struct bucket_mark m; + percpu_down_read(&c->usage_lock); spin_lock(&c->freelist_lock); - sort(ca->free_inc.data, - ca->free_inc.back, - sizeof(ca->free_inc.data[0]), - size_t_cmp, NULL); + + bch2_invalidate_bucket(c, ca, bucket, &m); + + verify_not_on_freelist(c, ca, bucket); + BUG_ON(!fifo_push(&ca->free_inc, bucket)); + spin_unlock(&c->freelist_lock); + + bucket_io_clock_reset(c, ca, bucket, READ); + bucket_io_clock_reset(c, ca, bucket, WRITE); + + percpu_up_read(&c->usage_lock); + + if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + *flush_seq = max(*flush_seq, bucket_seq); + } + + return m.cached_sectors != 0; } -static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, size_t nr, - bool nowait) +/* + * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: + */ +static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { struct btree_iter iter; + u64 journal_seq = 0; int ret = 0; + long b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ - while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) { - size_t b = fifo_idx_entry(&ca->free_inc, 
ca->nr_invalidated); - - ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq, - nowait && ca->nr_invalidated); - if (ret) - break; - - ca->nr_invalidated++; + while (!ret && + !fifo_full(&ca->free_inc) && + (b = next_alloc_bucket(ca)) >= 0) { + bool must_flush = + bch2_invalidate_one_bucket(c, ca, b, &journal_seq); + + ret = __bch2_alloc_write_key(c, ca, b, &iter, + must_flush ? &journal_seq : NULL, + !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0); } bch2_btree_iter_unlock(&iter); /* If we used NOWAIT, don't return the error: */ - return ca->nr_invalidated ? 0 : ret; -} - -static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -{ - unsigned i; + if (!fifo_empty(&ca->free_inc)) + ret = 0; + if (ret) { + bch_err(ca, "error invalidating buckets: %i", ret); + return ret; + } - /* - * Don't remove from free_inc until after it's added to - * freelist, so gc can find it: - */ - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) - if (fifo_push(&ca->free[i], bucket)) { - fifo_pop(&ca->free_inc, bucket); - --ca->nr_invalidated; - closure_wake_up(&c->freelist_wait); - spin_unlock(&c->freelist_lock); - return true; - } - spin_unlock(&c->freelist_lock); + if (journal_seq) + ret = bch2_journal_flush_seq(&c->journal, journal_seq); + if (ret) { + bch_err(ca, "journal error: %i", ret); + return ret; + } - return false; + return 0; } static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { + unsigned i; int ret = 0; while (1) { set_current_state(TASK_INTERRUPTIBLE); - if (__push_invalidated_bucket(c, ca, bucket)) - break; + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&c->freelist_wait); + spin_unlock(&c->freelist_lock); + goto out; + } + spin_unlock(&c->freelist_lock); if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { @@ -889,22 +908,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t schedule(); try_to_freeze(); } - +out: __set_current_state(TASK_RUNNING); return ret; } /* - * Given an invalidated, ready to use bucket: issue a discard to it if enabled, - * then add it to the freelist, waiting until there's room if necessary: + * Pulls buckets off free_inc, discards them (if enabled), then adds them to + * freelists, waiting until there's room if necessary: */ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) { - while (ca->nr_invalidated) { + while (!fifo_empty(&ca->free_inc)) { size_t bucket = fifo_peek(&ca->free_inc); - BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated); - if (ca->mi.discard && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, @@ -930,68 +947,32 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; - u64 journal_seq; + size_t nr; int ret; set_freezable(); while (1) { - while (1) { - cond_resched(); - - pr_debug("discarding %zu invalidated buckets", - ca->nr_invalidated); - - ret = discard_invalidated_buckets(c, ca); - if (ret) - goto stop; - - if (fifo_empty(&ca->free_inc)) - break; + cond_resched(); - pr_debug("invalidating %zu buckets", - fifo_used(&ca->free_inc)); + pr_debug("discarding %zu invalidated buckets", + fifo_used(&ca->free_inc)); - journal_seq = 0; - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - SIZE_MAX, true); - if (ret) { - bch_err(ca, "error invalidating buckets: %i", ret); - goto stop; - } - - if 
(!ca->nr_invalidated) { - bch_err(ca, "allocator thread unable to make forward progress!"); - goto stop; - } + ret = discard_invalidated_buckets(c, ca); + if (ret) + goto stop; - if (ca->allocator_invalidating_data) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - else if (ca->allocator_journal_seq_flush) - ret = bch2_journal_flush_seq(&c->journal, - ca->allocator_journal_seq_flush); + ret = bch2_invalidate_buckets(c, ca); + if (ret) + goto stop; - /* - * journal error - buckets haven't actually been - * invalidated, can't discard them: - */ - if (ret) { - bch_err(ca, "journal error: %i", ret); - goto stop; - } - } + if (!fifo_empty(&ca->free_inc)) + continue; pr_debug("free_inc now empty"); - /* Reset front/back so we can easily sort fifo entries later: */ - ca->free_inc.front = ca->free_inc.back = 0; - ca->allocator_journal_seq_flush = 0; - ca->allocator_invalidating_data = false; - down_read(&c->gc_lock); - while (1) { - size_t prev = fifo_used(&ca->free_inc); - + do { if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { up_read(&c->gc_lock); bch_err(ca, "gc failure"); @@ -1007,56 +988,46 @@ static int bch2_allocator_thread(void *arg) pr_debug("scanning for reclaimable buckets"); - find_reclaimable_buckets(c, ca); + nr = find_reclaimable_buckets(c, ca); - pr_debug("found %zu buckets (free_inc %zu/%zu)", - fifo_used(&ca->free_inc) - prev, - fifo_used(&ca->free_inc), ca->free_inc.size); + pr_debug("found %zu buckets", nr); - trace_alloc_batch(ca, fifo_used(&ca->free_inc), - ca->free_inc.size); + trace_alloc_batch(ca, nr, ca->alloc_heap.size); - if ((ca->inc_gen_needs_gc >= ca->free_inc.size || - (!fifo_full(&ca->free_inc) && - ca->inc_gen_really_needs_gc >= - fifo_free(&ca->free_inc))) && + if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || + ca->inc_gen_really_needs_gc) && c->gc_thread) { atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } - if (fifo_full(&ca->free_inc)) - break; - - if (!fifo_empty(&ca->free_inc) && - !fifo_full(&ca->free[RESERVE_MOVINGGC])) - break; - /* - * copygc may be waiting until either its reserve fills - * up, or we can't make forward progress: + * If we found any buckets, we have to invalidate them + * before we scan for more - but if we didn't find very + * many we may want to wait on more buckets being + * available so we don't spin: */ - ca->allocator_blocked = true; - closure_wake_up(&c->freelist_wait); - - ret = wait_buckets_available(c, ca); - if (ret) { - up_read(&c->gc_lock); - goto stop; + if (!nr || + (nr < ALLOC_SCAN_BATCH(ca) && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) { + ca->allocator_blocked = true; + closure_wake_up(&c->freelist_wait); + + ret = wait_buckets_available(c, ca); + if (ret) { + up_read(&c->gc_lock); + goto stop; + } } - } + } while (!nr); ca->allocator_blocked = false; up_read(&c->gc_lock); - pr_debug("free_inc now %zu/%zu", - fifo_used(&ca->free_inc), - ca->free_inc.size); - - sort_free_inc(c, ca); + pr_debug("%zu buckets to invalidate", nr); /* - * free_inc is now full of newly-invalidated buckets: next, + * alloc_heap is now full of newly-invalidated buckets: next, * write out the new bucket gens: */ } @@ -1946,39 +1917,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } +static void flush_held_btree_writes(struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + bool flush_updates; + size_t i, nr_pending_updates; + + clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); +again: + pr_debug("flushing dirty btree nodes"); + cond_resched(); + + flush_updates = false; + 
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); + + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (btree_node_dirty(b) && (!b->written || b->level)) { + if (btree_node_may_write(b)) { + rcu_read_unlock(); + btree_node_lock_type(c, b, SIX_LOCK_read); + bch2_btree_node_write(c, b, SIX_LOCK_read); + six_unlock_read(&b->lock); + goto again; + } else { + flush_updates = true; + } + } + rcu_read_unlock(); + + if (c->btree_roots_dirty) + bch2_journal_meta(&c->journal); + + /* + * This is ugly, but it's needed to flush btree node writes + * without spinning... + */ + if (flush_updates) { + closure_wait_event(&c->btree_interior_update_wait, + bch2_btree_interior_updates_nr_pending(c) < + nr_pending_updates); + goto again; + } + +} + static void allocator_start_issue_discards(struct bch_fs *c) { struct bch_dev *ca; unsigned dev_iter; - size_t i, bu; - - for_each_rw_member(ca, c, dev_iter) { - unsigned done = 0; - - fifo_for_each_entry(bu, &ca->free_inc, i) { - if (done == ca->nr_invalidated) - break; + size_t bu; + for_each_rw_member(ca, c, dev_iter) + while (fifo_pop(&ca->free_inc, bu)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, bu), ca->mi.bucket_size, GFP_NOIO); - done++; - } - } } static int __bch2_fs_allocator_start(struct bch_fs *c) { struct bch_dev *ca; - size_t bu, i; unsigned dev_iter; u64 journal_seq = 0; + long bu; bool invalidating_data = false; int ret = 0; if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) return -1; + if (test_alloc_startup(c)) { + invalidating_data = true; + goto not_enough; + } + /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { struct btree_iter iter; @@ -2003,7 +2018,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) percpu_up_read(&c->usage_lock); fifo_push(&ca->free_inc, bu); - ca->nr_invalidated++; if (fifo_full(&ca->free_inc)) break; @@ -2022,24 +2036,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) not_enough: pr_debug("did not find enough empty buckets; issuing discards"); - /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */ + /* clear out free_inc, we'll be using it again below: */ for_each_rw_member(ca, c, dev_iter) discard_invalidated_buckets(c, ca); pr_debug("scanning for reclaimable buckets"); for_each_rw_member(ca, c, dev_iter) { - BUG_ON(!fifo_empty(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; - find_reclaimable_buckets(c, ca); - sort_free_inc(c, ca); - invalidating_data |= ca->allocator_invalidating_data; + while (!fifo_full(&ca->free[RESERVE_BTREE]) && + (bu = next_alloc_bucket(ca)) >= 0) { + invalidating_data |= + bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); - fifo_for_each_entry(bu, &ca->free_inc, i) - if (!fifo_push(&ca->free[RESERVE_BTREE], bu)) - break; + fifo_push(&ca->free[RESERVE_BTREE], bu); + set_bit(bu, ca->buckets_dirty); + } } pr_debug("done scanning for reclaimable buckets"); @@ -2065,16 +2078,9 @@ not_enough: * XXX: it's possible for this to deadlock waiting on journal reclaim, * since we're holding btree writes. What then? 
*/ - - for_each_rw_member(ca, c, dev_iter) { - ret = bch2_invalidate_free_inc(c, ca, &journal_seq, - ca->free[RESERVE_BTREE].size, - false); - if (ret) { - percpu_ref_put(&ca->io_ref); - return ret; - } - } + ret = bch2_alloc_write(c); + if (ret) + return ret; if (invalidating_data) { pr_debug("flushing journal"); @@ -2087,57 +2093,11 @@ not_enough: allocator_start_issue_discards(c); } - for_each_rw_member(ca, c, dev_iter) - while (ca->nr_invalidated) { - BUG_ON(!fifo_pop(&ca->free_inc, bu)); - ca->nr_invalidated--; - } - set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags); /* now flush dirty btree nodes: */ - if (invalidating_data) { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool flush_updates; - size_t nr_pending_updates; - - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); -again: - pr_debug("flushing dirty btree nodes"); - cond_resched(); - - flush_updates = false; - nr_pending_updates = bch2_btree_interior_updates_nr_pending(c); - - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_dirty(b) && (!b->written || b->level)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - flush_updates = true; - } - } - rcu_read_unlock(); - - /* - * This is ugly, but it's needed to flush btree node writes - * without spinning... - */ - if (flush_updates) { - closure_wait_event(&c->btree_interior_update_wait, - bch2_btree_interior_updates_nr_pending(c) < - nr_pending_updates); - goto again; - } - } + if (invalidating_data) + flush_held_btree_writes(c); return 0; } diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h index 00d01f464c68..2a6500d6f97a 100644 --- a/fs/bcachefs/alloc.h +++ b/fs/bcachefs/alloc.h @@ -9,6 +9,8 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; +#define ALLOC_SCAN_BATCH(ca) ((ca)->mi.nbuckets >> 9) + const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7c6b1925f67b..8dd96a2de1a3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -270,6 +270,10 @@ do { \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -403,7 +407,6 @@ struct bch_dev { alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; spinlock_t freelist_lock; - size_t nr_invalidated; u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; @@ -415,8 +418,6 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; - u64 allocator_journal_seq_flush; - bool allocator_invalidating_data; bool allocator_blocked; alloc_heap alloc_heap; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index cc1f8b9a9e09..bc667ac70f57 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1145,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, struct btree *old; trace_btree_set_root(c, b); - BUG_ON(!b->written); + BUG_ON(!b->written && + 
!test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); old = btree_node_root(c, b); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eec2f6cb4f5b..6a7e8b7b6a79 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -405,7 +405,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, _old; \ }) -bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { struct bucket *g; @@ -416,8 +416,7 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, g = bucket(ca, b); *old = bucket_data_cmpxchg(c, ca, g, new, ({ - if (!is_available_bucket(new)) - return false; + BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; new.data_type = 0; @@ -429,7 +428,6 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); - return true; } void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -822,7 +820,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7); - size_t free_inc_reserve = copygc_reserve / 2; + size_t free_inc_nr = max(max_t(size_t, 16, ca->mi.nbuckets >> 12), + btree_reserve); bool resize = ca->buckets != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; @@ -845,8 +844,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || - !init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) || - !init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) || + !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || + !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) goto err; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index d0dc9c8b4f0b..2671ad29edf9 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -205,7 +205,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, +void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); |
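As the find_reclaimable_buckets_random() hunk above shows, a random scan can pick the same bucket more than once, so the new code sorts ca->alloc_heap by bucket index with bucket_idx_cmp() and zeroes the nr field of duplicate entries. Here is a standalone illustration of that dedup step, using qsort() in place of the kernel's sort() and a pared-down alloc_heap_entry with only the fields the example needs:

```c
/*
 * Sort by bucket index, then mark duplicate picks dead by zeroing nr,
 * mirroring the dedup added to find_reclaimable_buckets_random().
 * Simplified struct and userspace qsort(); a sketch, not the kernel code.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct alloc_heap_entry {
	size_t bucket;
	size_t nr;
};

/* same comparison the diff adds as bucket_idx_cmp() */
static int bucket_idx_cmp(const void *_l, const void *_r)
{
	const struct alloc_heap_entry *l = _l, *r = _r;

	return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}

int main(void)
{
	struct alloc_heap_entry heap[] = {
		{ .bucket = 42, .nr = 1 },
		{ .bucket =  7, .nr = 1 },
		{ .bucket = 42, .nr = 1 },      /* duplicate pick */
		{ .bucket = 13, .nr = 1 },
	};
	size_t used = sizeof(heap) / sizeof(heap[0]), i;

	qsort(heap, used, sizeof(heap[0]), bucket_idx_cmp);

	/* remove duplicates by zeroing nr, exactly as the new code does */
	for (i = 0; i + 1 < used; i++)
		if (heap[i].bucket == heap[i + 1].bucket)
			heap[i].nr = 0;

	for (i = 0; i < used; i++)
		printf("bucket %zu nr %zu\n", heap[i].bucket, heap[i].nr);
	return 0;
}
```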