summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-01-04 00:00:50 -0500
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:50 -0400
commit920e69bc3db88d3825c69190cafd43f0a1918d3b (patch)
tree6ff5cce6d8cbf0e014eeb1369e92512b60a21595
parentf2b542ba42a8b35d9dc43f5eab9791fea76bfd3a (diff)
downloadlwn-920e69bc3db88d3825c69190cafd43f0a1918d3b.tar.gz
lwn-920e69bc3db88d3825c69190cafd43f0a1918d3b.zip
bcachefs: Btree write buffer
This adds a new method of doing btree updates - a straight write buffer, implemented as a flat fixed size array. This is only useful when we don't need to read from the btree in order to do the update, and when reading is infrequent - perfect for the LRU btree. This will make LRU btree updates fast enough that we'll be able to use it for persistently indexing buckets by fragmentation, which will be a massive boost to copygc performance. Changes: - A new btree_insert_type enum, for btree_insert_entries. Specifies btree, btree key cache, or btree write buffer. - bch2_trans_update_buffered(): updates via the btree write buffer don't need a btree path, so we need a new update path. - Transaction commit path changes: The update to the btree write buffer both mutates global, and can fail if there isn't currently room. Therefore we do all write buffer updates in the transaction all at once, and also if it fails we have to revert filesystem usage counter changes. If there isn't room we flush the write buffer in the transaction commit error path and retry. - A new persistent option, for specifying the number of entries in the write buffer. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--fs/bcachefs/Makefile1
-rw-r--r--fs/bcachefs/bcachefs.h4
-rw-r--r--fs/bcachefs/bcachefs_format.h4
-rw-r--r--fs/bcachefs/btree_iter.c17
-rw-r--r--fs/bcachefs/btree_types.h3
-rw-r--r--fs/bcachefs/btree_update.h12
-rw-r--r--fs/bcachefs/btree_update_leaf.c172
-rw-r--r--fs/bcachefs/btree_write_buffer.c330
-rw-r--r--fs/bcachefs/btree_write_buffer.h14
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h44
-rw-r--r--fs/bcachefs/buckets.c41
-rw-r--r--fs/bcachefs/buckets.h1
-rw-r--r--fs/bcachefs/errcode.h2
-rw-r--r--fs/bcachefs/opts.h5
-rw-r--r--fs/bcachefs/super.c3
-rw-r--r--fs/bcachefs/trace.h45
16 files changed, 677 insertions, 21 deletions
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 966c9b9a74fc..c0e715760c8b 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -16,6 +16,7 @@ bcachefs-y := \
btree_locking.o \
btree_update_interior.o \
btree_update_leaf.o \
+ btree_write_buffer.o \
buckets.o \
buckets_waiting_for_journal.o \
chardev.o \
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index ad3bf019487e..91f635faccb0 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -391,6 +391,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
+#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
@@ -575,6 +576,7 @@ struct btree_transaction_stats {
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
+ unsigned wb_updates_size;
unsigned max_mem;
char *max_paths_text;
};
@@ -789,6 +791,8 @@ struct bch_fs {
struct btree_key_cache btree_key_cache;
unsigned btree_key_cache_btrees;
+ struct btree_write_buffer btree_write_buffer;
+
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d96efc8338d5..8e070402e73f 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1404,7 +1404,8 @@ struct bch_sb_field_disk_groups {
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74)
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
@@ -1633,6 +1634,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
/*
* Features:
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 21f12e522360..4ac1364acc8b 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1374,6 +1374,7 @@ noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
@@ -1398,6 +1399,17 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
+ trans_for_each_wb_update(trans, wb) {
+ prt_printf(buf, "update: btree=%s wb=1 %pS",
+ bch2_btree_ids[wb->btree],
+ (void *) i->ip_allocated);
+ prt_newline(buf);
+
+ prt_printf(buf, " new ");
+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
+ prt_newline(buf);
+ }
+
printbuf_indent_sub(buf, 2);
}
@@ -2929,8 +2941,11 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
trans->mem_bytes = expected_mem_bytes;
}
}
- if (s)
+
+ if (s) {
trans->nr_max_paths = s->nr_max_paths;
+ trans->wb_updates_size = s->wb_updates_size;
+ }
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 93c928a93dca..153ae548a89a 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -419,6 +419,8 @@ struct btree_trans {
u8 fn_idx;
u8 nr_sorted;
u8 nr_updates;
+ u8 nr_wb_updates;
+ u8 wb_updates_size;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -448,6 +450,7 @@ struct btree_trans {
u8 sorted[BTREE_ITER_MAX + 8];
struct btree_path *paths;
struct btree_insert_entry *updates;
+ struct btree_write_buffered_key *wb_updates;
/* update path: */
struct btree_trans_commit_hook *hooks;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 673c3a78aae2..96d27e34d5b1 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -15,6 +15,9 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, u64);
+
enum btree_insert_flags {
/* First two bits for journal watermark: */
__BTREE_INSERT_NOFAIL = 2,
@@ -77,6 +80,8 @@ int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
+int __must_check bch2_trans_update_buffered(struct btree_trans *,
+ enum btree_id, struct bkey_i *);
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
@@ -142,6 +147,11 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
+#define trans_for_each_wb_update(_trans, _i) \
+ for ((_i) = (_trans)->wb_updates; \
+ (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
+ (_i)++)
+
static inline void bch2_trans_reset_updates(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -151,6 +161,8 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
trans->extra_journal_res = 0;
trans->nr_updates = 0;
+ trans->nr_wb_updates = 0;
+ trans->wb_updates = NULL;
trans->hooks = NULL;
trans->extra_journal_entries.nr = 0;
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 84f79affbe07..8169f2b89848 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -8,6 +8,7 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "debug.h"
#include "errcode.h"
@@ -100,9 +101,6 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
- if (path->cached)
- return;
-
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_trans_node_reinit_iter(trans, b);
@@ -252,25 +250,26 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
/**
* btree_insert_key - insert a key one key into a leaf node
*/
-static void btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_insert_entry *insert)
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bkey_i *insert,
+ u64 journal_seq)
{
struct bch_fs *c = trans->c;
- struct btree *b = insert_l(insert)->b;
+ struct btree *b = path_l(path)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
- &insert_l(insert)->iter, insert->k)))
+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+ &path_l(path)->iter, insert)))
return;
- i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
- le64_to_cpu(i->journal_seq)));
+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
- bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
+ bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty_acct(c, b);
@@ -288,6 +287,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_trans_node_reinit_iter(trans, b);
}
+static void btree_insert_key_leaf(struct btree_trans *trans,
+ struct btree_insert_entry *insert)
+{
+ bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq);
+}
+
/* Cached btree updates: */
/* Normal update interface: */
@@ -594,6 +599,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
@@ -638,6 +644,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
marking = true;
}
+ if (trans->nr_wb_updates &&
+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+ return -BCH_ERR_btree_insert_need_flush_buffer;
+
/*
* Don't get journal reservation until after we know insert will
* succeed:
@@ -674,17 +684,25 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
+ if (trans->nr_wb_updates) {
+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+
+ ret = bch2_btree_insert_keys_write_buffer(trans);
+ if (ret)
+ goto revert_fs_usage;
+ }
+
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, i->flags);
if (ret)
- return ret;
+ goto fatal_err;
}
if (unlikely(c->gc_pos.phase)) {
ret = bch2_trans_commit_run_gc_triggers(trans);
if (ret)
- return ret;
+ goto fatal_err;
}
if (unlikely(trans->extra_journal_entries.nr)) {
@@ -697,10 +715,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- trans_for_each_update(trans, i) {
- struct journal *j = &c->journal;
- struct jset_entry *entry;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+ trans_for_each_update(trans, i) {
if (i->key_cache_already_flushed)
continue;
@@ -725,6 +743,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bkey_copy(&entry->start[0], i->k);
}
+ trans_for_each_wb_update(trans, wb) {
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ wb->btree, 0,
+ wb->k.k.u64s);
+ bkey_copy(&entry->start[0], &wb->k);
+ }
+
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
@@ -742,6 +768,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
}
+ return 0;
+fatal_err:
+ bch2_fatal_error(c);
+revert_fs_usage:
+ if (trans->fs_usage_deltas)
+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
return ret;
}
@@ -769,7 +801,8 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ if (!i->cached)
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
@@ -778,9 +811,13 @@ static inline int trans_lock_write(struct btree_trans *trans)
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
+ struct btree_write_buffered_key *wb;
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+ trans_for_each_wb_update(trans, wb)
+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -821,10 +858,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct printbuf buf = PRINTBUF;
int ret, u64s_delta = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+
trans_for_each_update(trans, i) {
int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
@@ -833,8 +871,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
btree_insert_entry_checks(trans, i);
}
-#endif
printbuf_exit(&buf);
+#endif
trans_for_each_update(trans, i) {
if (i->cached)
@@ -962,6 +1000,30 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if (ret)
trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
+ case -BCH_ERR_btree_insert_need_flush_buffer: {
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ ret = 0;
+
+ if (wb->state.nr > wb->size * 3 / 4) {
+ bch2_trans_reset_updates(trans);
+ bch2_trans_unlock(trans);
+
+ mutex_lock(&wb->flush_lock);
+
+ if (wb->state.nr > wb->size * 3 / 4)
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ else
+ mutex_unlock(&wb->flush_lock);
+
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ }
+ break;
+ }
default:
BUG_ON(ret >= 0);
break;
@@ -1023,10 +1085,12 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
+ struct btree_write_buffered_key *wb;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
+ !trans->nr_wb_updates &&
!trans->extra_journal_entries.nr)
goto out_reset;
@@ -1049,6 +1113,20 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
+ mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+ bch2_trans_begin(trans);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_btree_write_buffer_flush(trans,
+ flags|BTREE_INSERT_NOCHECK_RW, true);
+ if (!ret) {
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+ }
+ goto out;
+ }
+
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
@@ -1089,6 +1167,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
+ trans_for_each_wb_update(trans, wb)
+ trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
@@ -1606,6 +1687,59 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
return bch2_trans_update_by_path(trans, path, k, flags);
}
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ struct btree_write_buffered_key *i;
+ int ret;
+
+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ trans_for_each_wb_update(trans, i) {
+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+ bkey_copy(&i->k, k);
+ return 0;
+ }
+ }
+
+ if (!trans->wb_updates ||
+ trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_write_buffered_key *u;
+
+ if (trans->nr_wb_updates == trans->wb_updates_size) {
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+ BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+ if (s)
+ s->wb_updates_size = trans->wb_updates_size;
+ }
+
+ u = bch2_trans_kmalloc_nomemzero(trans,
+ trans->wb_updates_size *
+ sizeof(struct btree_write_buffered_key));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ if (trans->nr_wb_updates)
+ memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+ sizeof(struct btree_write_buffered_key));
+ trans->wb_updates = u;
+ }
+
+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+ .btree = btree,
+ };
+
+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+ trans->nr_wb_updates++;
+
+ return 0;
+}
+
void bch2_trans_commit_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
new file mode 100644
index 000000000000..84c3e6ddb38e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_locking.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+
+#include <linux/sort.h>
+
+static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?:
+ bpos_cmp(l->k.k.p, r->k.k.p) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset);
+}
+
+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ unsigned commit_flags,
+ bool *write_locked,
+ size_t *fast)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ int ret;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ return ret;
+
+ path = iter->path;
+
+ if (!*write_locked) {
+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
+ if (ret)
+ return ret;
+
+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
+ *write_locked = true;
+ }
+
+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ *write_locked = false;
+ goto trans_commit;
+ }
+
+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
+ (*fast)++;
+ return 0;
+trans_commit:
+ return bch2_trans_update(trans, iter, &wb->k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM);
+}
+
+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
+{
+ union btree_write_buffer_state old, new;
+ u64 v = READ_ONCE(wb->state.v);
+
+ do {
+ old.v = new.v = v;
+
+ new.nr = 0;
+ new.idx++;
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
+ cpu_relax();
+
+ smp_mb();
+
+ return old;
+}
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
+ bool locked)
+{
+ struct bch_fs *c = trans->c;
+ struct journal *j = &c->journal;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct journal_entry_pin pin;
+ struct btree_write_buffered_key *i, *dst, *keys;
+ struct btree_iter iter = { NULL };
+ size_t nr = 0, skipped = 0, fast = 0;
+ bool write_locked = false;
+ union btree_write_buffer_state s;
+ int ret = 0;
+
+ memset(&pin, 0, sizeof(pin));
+
+ if (!locked && !mutex_trylock(&wb->flush_lock))
+ return 0;
+
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_drop(j, &wb->journal_pin);
+
+ s = btree_write_buffer_switch(wb);
+ keys = wb->keys[s.idx];
+ nr = s.nr;
+
+ /*
+ * We first sort so that we can detect and skip redundant updates, and
+ * then we attempt to flush in sorted btree order, as this is most
+ * efficient.
+ *
+ * However, since we're not flushing in the order they appear in the
+ * journal we won't be able to drop our journal pin until everything is
+ * flushed - which means this could deadlock the journal, if we weren't
+ * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail
+ * if it would block taking a journal reservation.
+ *
+ * If that happens, we sort them by the order they appeared in the
+ * journal - after dropping redundant entries - and then restart
+ * flushing, this time dropping journal pins as we go.
+ */
+
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_key_cmp, NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (i + 1 < keys + nr &&
+ i[0].btree == i[1].btree &&
+ bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
+ skipped++;
+ continue;
+ }
+
+ if (write_locked &&
+ (iter.path->btree_id != i->btree ||
+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ write_locked = false;
+ }
+
+ if (!iter.path || iter.path->btree_id != i->btree) {
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ }
+
+ bch2_btree_iter_set_pos(&iter, i->k.k.p);
+ iter.path->preserve = false;
+
+ do {
+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
+ commit_flags, &write_locked, &fast);
+ if (!write_locked)
+ bch2_trans_begin(trans);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+
+ if (ret)
+ break;
+ }
+
+ if (write_locked)
+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ bch2_trans_iter_exit(trans, &iter);
+
+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ goto slowpath;
+
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+out:
+ bch2_journal_pin_drop(j, &pin);
+ mutex_unlock(&wb->flush_lock);
+ return ret;
+slowpath:
+ trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+
+ dst = keys;
+ for (; i < keys + nr; i++) {
+ if (i + 1 < keys + nr &&
+ i[0].btree == i[1].btree &&
+ bpos_eq(i[0].k.k.p, i[1].k.k.p))
+ continue;
+
+ *dst = *i;
+ dst++;
+ }
+ nr = dst - keys;
+
+ sort(keys, nr, sizeof(keys[0]),
+ btree_write_buffered_journal_cmp,
+ NULL);
+
+ for (i = keys; i < keys + nr; i++) {
+ if (i->journal_seq > pin.seq) {
+ struct journal_entry_pin pin2;
+
+ memset(&pin2, 0, sizeof(pin2));
+
+ bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin);
+ bch2_journal_pin_copy(j, &pin, &pin2, NULL);
+ bch2_journal_pin_drop(j, &pin2);
+ }
+
+ ret = commit_do(trans, NULL, NULL,
+ commit_flags|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ JOURNAL_WATERMARK_reserved,
+ __bch2_btree_insert(trans, i->btree, &i->k));
+ if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
+ break;
+ }
+
+ goto out;
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ mutex_lock(&trans->c->btree_write_buffer.flush_lock);
+ return __bch2_btree_write_buffer_flush(trans, 0, true);
+}
+
+int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+{
+ return __bch2_btree_write_buffer_flush(trans, 0, false);
+}
+
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_lock(&wb->flush_lock);
+
+ return bch2_trans_run(c,
+ __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+}
+
+static inline u64 btree_write_buffer_ref(int idx)
+{
+ return ((union btree_write_buffer_state) {
+ .ref0 = idx == 0,
+ .ref1 = idx == 1,
+ }).v;
+}
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key *i;
+ union btree_write_buffer_state old, new;
+ int ret = 0;
+ u64 v;
+
+ trans_for_each_wb_update(trans, i) {
+ EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ i->journal_seq = trans->journal_res.seq;
+ i->journal_offset = trans->journal_res.offset;
+ }
+
+ preempt_disable();
+ v = READ_ONCE(wb->state.v);
+ do {
+ old.v = new.v = v;
+
+ new.v += btree_write_buffer_ref(new.idx);
+ new.nr += trans->nr_wb_updates;
+ if (new.nr > wb->size) {
+ ret = -BCH_ERR_btree_insert_need_flush_buffer;
+ goto out;
+ }
+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+
+ memcpy(wb->keys[new.idx] + old.nr,
+ trans->wb_updates,
+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+out:
+ preempt_enable();
+ return ret;
+}
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+
+ kvfree(wb->keys[1]);
+ kvfree(wb->keys[0]);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ mutex_init(&wb->flush_lock);
+ wb->size = c->opts.btree_write_buffer_size;
+
+ wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
+ wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
+ if (!wb->keys[0] || !wb->keys[1])
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
new file mode 100644
index 000000000000..322df1c8304e
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_H
+
+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+int bch2_btree_write_buffer_flush(struct btree_trans *);
+
+int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+
+void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+int bch2_fs_btree_write_buffer_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
new file mode 100644
index 000000000000..99993ba77aea
--- /dev/null
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+
+#include "journal_types.h"
+
+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
+
+struct btree_write_buffered_key {
+ u64 journal_seq;
+ unsigned journal_offset;
+ enum btree_id btree;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
+
+union btree_write_buffer_state {
+ struct {
+ atomic64_t counter;
+ };
+
+ struct {
+ u64 v;
+ };
+
+ struct {
+ u64 nr:23;
+ u64 idx:1;
+ u64 ref0:20;
+ u64 ref1:20;
+ };
+};
+
+struct btree_write_buffer {
+ struct mutex flush_lock;
+ struct journal_entry_pin journal_pin;
+
+ union btree_write_buffer_state state;
+ size_t size;
+
+ struct btree_write_buffered_key *keys[2];
+};
+
+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 153987376b89..86f48f5762dd 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1278,6 +1278,47 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
+void bch2_trans_fs_usage_revert(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *dst;
+ struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
+ s64 added = 0;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ /* revert changes: */
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+ BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
+ }
+
+ dst->nr_inodes -= deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added -= deltas->persistent_reserved[i];
+ dst->reserved -= deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors += added;
+ this_cpu_add(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+}
+
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 0fc101b9aaf1..e8e3a3b09714 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -229,6 +229,7 @@ int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index bb296edcf4f7..7a6448f48fca 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -42,6 +42,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
@@ -58,6 +59,7 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
+ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
x(0, fsck) \
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index c6025172f32e..85927b306014 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -206,6 +206,11 @@ enum opt_type {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
+ x(btree_write_buffer_size, u32, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(16, (1U << 20) - 1), \
+ BCH2_NO_SB_OPT, 1U << 13, \
+ NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e7e3dcbe2339..ade8d074e887 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -16,6 +16,7 @@
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
@@ -463,6 +464,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
+ bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
@@ -816,6 +818,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
bch2_fs_encryption_init(c) ?:
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 17fc58e73702..937fd132bfd2 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1111,6 +1111,51 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
+TRACE_EVENT(write_buffer_flush,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
+ TP_ARGS(trans, nr, skipped, fast, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, skipped )
+ __field(size_t, fast )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->skipped = skipped;
+ __entry->fast = fast;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu skipped %zu fast %zu",
+ __entry->nr, __entry->size, __entry->skipped, __entry->fast)
+);
+
+TRACE_EVENT(write_buffer_flush_slowpath,
+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
+ TP_ARGS(trans, nr, size),
+
+ TP_STRUCT__entry(
+ __field(size_t, nr )
+ __field(size_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->size = size;
+ ),
+
+ TP_printk("%zu/%zu", __entry->nr, __entry->size)
+);
+
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */