diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2018-07-24 16:42:49 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:08:08 -0400 |
commit | 6eac2c2e2440280ca551d4936807a8a130970469 (patch) | |
tree | 95f0176b9e94bb29f10d98c7abfdb9dac2374d74 /fs/bcachefs | |
parent | 5b650fd11a00271b9d4c033d1d0780826e050137 (diff) | |
download | lwn-6eac2c2e2440280ca551d4936807a8a130970469.tar.gz lwn-6eac2c2e2440280ca551d4936807a8a130970469.zip |
bcachefs: Change how replicated data is accounted
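Because of compression, the different replicas of a replicated extent do
not necessarily take up the same amount of space on disk. Sector counts
for replicated data therefore should not be stored divided by the number
of replicas; they are now stored as the total across all replicas, and
readers of those counters no longer multiply by the replica count.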
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r-- | fs/bcachefs/btree_update_interior.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/buckets.c | 99 | ||||
-rw-r--r-- | fs/bcachefs/buckets_types.h | 1 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 51 | ||||
-rw-r--r-- | fs/bcachefs/sysfs.c | 4 |
5 files changed, 107 insertions, 50 deletions
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index a37b5edea699..b60eb3d33c7b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -185,7 +185,7 @@ found: replicas = bch2_extent_nr_dirty_ptrs(k); if (replicas) stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= - c->opts.btree_node_size; + c->opts.btree_node_size * replicas; /* * We're dropping @k from the btree, but it's still live until the diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 56b197bff4f0..ab61abdf975d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -248,29 +248,28 @@ bch2_fs_usage_read(struct bch_fs *c) struct fs_usage_sum { u64 hidden; u64 data; + u64 cached; u64 reserved; }; static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) { struct fs_usage_sum sum = { 0 }; - unsigned i, j; + unsigned i; /* * For superblock and journal we count bucket usage, not sector usage, * because any internal fragmentation should _not_ be counted as * free space: */ - for (j = 1; j < BCH_DATA_BTREE; j++) - sum.hidden += stats.buckets[j]; + sum.hidden += stats.buckets[BCH_DATA_SB]; + sum.hidden += stats.buckets[BCH_DATA_JOURNAL]; for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { - for (j = BCH_DATA_BTREE; - j < ARRAY_SIZE(stats.replicas[i].data); - j++) - sum.data += stats.replicas[i].data[j] * (i + 1); - - sum.reserved += stats.replicas[i].persistent_reserved * (i + 1); + sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; + sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; + sum.reserved += stats.replicas[i].persistent_reserved; } sum.reserved += stats.online_reserved; @@ -379,17 +378,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage *dev_usage; - if (c) - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->usage_lock); - if (old.data_type && 
new.data_type && - old.data_type != new.data_type) { - BUG_ON(!c); - bch2_fs_inconsistent(c, - "different types of data in same bucket: %s, %s", - bch2_data_types[old.data_type], - bch2_data_types[new.data_type]); - } + bch2_fs_inconsistent_on(old.data_type && new.data_type && + old.data_type != new.data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[old.data_type], + bch2_data_types[new.data_type]); stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; stats->buckets[bucket_type(new)] += ca->mi.bucket_size; @@ -448,6 +443,12 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); + /* + * This isn't actually correct yet, since fs usage is still + * uncompressed sectors: + */ + stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); @@ -501,26 +502,34 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && gc_will_visit(c, pos)) return; - } - preempt_disable(); - stats = this_cpu_ptr(c->usage_percpu); + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); + g = bucket(ca, b); + old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); - stats->replicas[0].data[type] += sectors; - preempt_enable(); + stats->replicas[0].data[type] += sectors; + preempt_enable(); + } else { + rcu_read_lock(); + + g = bucket(ca, b); + old = bucket_cmpxchg(g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + rcu_read_unlock(); + } BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); } -/* Reverting this until the copygc + compression 
issue is fixed: */ - static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) { if (!sectors) @@ -540,12 +549,14 @@ static void bch2_mark_pointer(struct bch_fs *c, const struct bch_extent_ptr *ptr, struct bch_extent_crc_unpacked crc, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *stats, + unsigned replicas, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_BUCKET(ca, ptr); + s64 uncompressed_sectors = sectors; u64 v; if (crc.compression_type) { @@ -563,6 +574,20 @@ static void bch2_mark_pointer(struct bch_fs *c, +__disk_sectors(crc, new_sectors); } + /* + * fs level usage (which determines free space) is in uncompressed + * sectors, until copygc + compression is sorted out: + * + * note also that we always update @fs_usage, even when we otherwise + * wouldn't do anything because gc is running - this is because the + * caller still needs to account w.r.t. its disk reservation. It is + * caller's responsibility to not apply @fs_usage if gc is in progress. + */ + fs_usage->replicas + [!ptr->cached && replicas ? replicas - 1 : 0].data + [!ptr->cached ? 
data_type : BCH_DATA_CACHED] += + uncompressed_sectors; + if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ @@ -614,7 +639,7 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, stats, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); @@ -677,15 +702,13 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, extent_for_each_ptr_crc(e, ptr, crc) bch2_mark_pointer(c, e, ptr, crc, sectors, data_type, - stats, journal_seq, flags); - - if (replicas) - stats->replicas[replicas - 1].data[data_type] += sectors; + replicas, stats, journal_seq, flags); break; } case BCH_RESERVATION: if (replicas) - stats->replicas[replicas - 1].persistent_reserved += sectors; + stats->replicas[replicas - 1].persistent_reserved += + sectors * replicas; break; } percpu_up_read(&c->usage_lock); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 9968570832e3..49f3ab9009ea 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -62,7 +62,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - /* _uncompressed_ sectors: */ u64 online_reserved; u64 available_cache; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fe95b8b026e8..e44bc95d8deb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -985,14 +985,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - - bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - percpu_ref_reinit(&ca->io_ref); return 0; @@ -1018,6 +1010,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) 
return ret; + mutex_lock(&c->sb_lock); + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + mutex_unlock(&c->sb_lock); + bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1295,6 +1292,24 @@ err: return ret; } +static void dev_usage_clear(struct bch_dev *ca) +{ + struct bucket_array *buckets; + int cpu; + + for_each_possible_cpu(cpu) { + struct bch_dev_usage *p = + per_cpu_ptr(ca->usage_percpu, cpu); + memset(p, 0, sizeof(*p)); + } + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); + up_read(&ca->bucket_lock); +} + /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1333,11 +1348,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path) return ret; } + /* + * We want to allocate journal on the new device before adding the new + * device to the filesystem because allocating after we attach requires + * spinning up the allocator thread, and the allocator thread requires + * doing btree writes, which if the existing devices are RO isn't going + * to work + * + * So we have to mark where the superblocks are, but marking allocated + * data normally updates the filesystem usage too, so we have to mark, + * allocate the journal, reset all the marks, then remark after we + * attach... 
+ */ + bch2_mark_dev_superblock(ca->fs, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; + dev_usage_clear(ca); + mutex_lock(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1388,6 +1420,9 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4ce7168e930b..582e281694a9 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -781,7 +781,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) " meta: %llu\n" " user: %llu\n" " cached: %llu\n" - " available: %llu\n" + " available: %lli\n" "sectors:\n" " sb: %llu\n" " journal: %llu\n" @@ -802,7 +802,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) stats.buckets[BCH_DATA_BTREE], stats.buckets[BCH_DATA_USER], stats.buckets[BCH_DATA_CACHED], - __dev_buckets_available(ca, stats), + ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, stats.sectors[BCH_DATA_SB], stats.sectors[BCH_DATA_JOURNAL], stats.sectors[BCH_DATA_BTREE], |