summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/accessors.h1
-rw-r--r--fs/btrfs/acl.h2
-rw-r--r--fs/btrfs/async-thread.c17
-rw-r--r--fs/btrfs/backref.c176
-rw-r--r--fs/btrfs/backref.h16
-rw-r--r--fs/btrfs/bio.c49
-rw-r--r--fs/btrfs/block-group.c217
-rw-r--r--fs/btrfs/block-rsv.c10
-rw-r--r--fs/btrfs/btrfs_inode.h19
-rw-r--r--fs/btrfs/compression.c31
-rw-r--r--fs/btrfs/compression.h26
-rw-r--r--fs/btrfs/ctree.c99
-rw-r--r--fs/btrfs/ctree.h31
-rw-r--r--fs/btrfs/defrag.c78
-rw-r--r--fs/btrfs/defrag.h4
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-inode.c150
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c89
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/dev-replace.c36
-rw-r--r--fs/btrfs/dir-item.c26
-rw-r--r--fs/btrfs/dir-item.h1
-rw-r--r--fs/btrfs/direct-io.c20
-rw-r--r--fs/btrfs/direct-io.h2
-rw-r--r--fs/btrfs/discard.c34
-rw-r--r--fs/btrfs/discard.h1
-rw-r--r--fs/btrfs/disk-io.c192
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/export.c51
-rw-r--r--fs/btrfs/extent-io-tree.c8
-rw-r--r--fs/btrfs/extent-tree.c263
-rw-r--r--fs/btrfs/extent-tree.h8
-rw-r--r--fs/btrfs/extent_io.c856
-rw-r--r--fs/btrfs/extent_io.h25
-rw-r--r--fs/btrfs/extent_map.c83
-rw-r--r--fs/btrfs/file-item.c31
-rw-r--r--fs/btrfs/file-item.h2
-rw-r--r--fs/btrfs/file.c154
-rw-r--r--fs/btrfs/file.h2
-rw-r--r--fs/btrfs/free-space-cache.c62
-rw-r--r--fs/btrfs/free-space-tree.c56
-rw-r--r--fs/btrfs/fs.c131
-rw-r--r--fs/btrfs/fs.h57
-rw-r--r--fs/btrfs/inode-item.c11
-rw-r--r--fs/btrfs/inode.c1042
-rw-r--r--fs/btrfs/ioctl.c572
-rw-r--r--fs/btrfs/ioctl.h5
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/locking.h5
-rw-r--r--fs/btrfs/ordered-data.c35
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.h2
-rw-r--r--fs/btrfs/props.c66
-rw-r--r--fs/btrfs/props.h8
-rw-r--r--fs/btrfs/qgroup.c55
-rw-r--r--fs/btrfs/qgroup.h3
-rw-r--r--fs/btrfs/raid-stripe-tree.c146
-rw-r--r--fs/btrfs/raid-stripe-tree.h1
-rw-r--r--fs/btrfs/reflink.c100
-rw-r--r--fs/btrfs/relocation.c401
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/scrub.c8
-rw-r--r--fs/btrfs/send.c553
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/space-info.c71
-rw-r--r--fs/btrfs/space-info.h15
-rw-r--r--fs/btrfs/subpage.c249
-rw-r--r--fs/btrfs/subpage.h65
-rw-r--r--fs/btrfs/super.c31
-rw-r--r--fs/btrfs/sysfs.c184
-rw-r--r--fs/btrfs/sysfs.h7
-rw-r--r--fs/btrfs/tests/btrfs-tests.c18
-rw-r--r--fs/btrfs/tests/btrfs-tests.h6
-rw-r--r--fs/btrfs/tests/delayed-refs-tests.c1016
-rw-r--r--fs/btrfs/tests/extent-io-tests.c6
-rw-r--r--fs/btrfs/tests/extent-map-tests.c1
-rw-r--r--fs/btrfs/tests/raid-stripe-tree-tests.c661
-rw-r--r--fs/btrfs/transaction.c46
-rw-r--r--fs/btrfs/transaction.h18
-rw-r--r--fs/btrfs/tree-checker.c98
-rw-r--r--fs/btrfs/tree-checker.h7
-rw-r--r--fs/btrfs/tree-log.c396
-rw-r--r--fs/btrfs/uuid-tree.c2
-rw-r--r--fs/btrfs/verity.c4
-rw-r--r--fs/btrfs/volumes.c269
-rw-r--r--fs/btrfs/volumes.h25
-rw-r--r--fs/btrfs/xattr.c1
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c85
-rw-r--r--fs/btrfs/zoned.c157
-rw-r--r--fs/btrfs/zoned.h7
-rw-r--r--fs/btrfs/zstd.c68
95 files changed, 6288 insertions, 3386 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..73a2dfb854c5 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3cfc440c636c..2d5f0482678b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
tests/free-space-tree-tests.o tests/extent-map-tests.o \
- tests/raid-stripe-tree-tests.o
+ tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 7a7e0ef69973..15ea6348800b 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>
+#include "extent_io.h"
struct extent_buffer;
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 48b9ddae4a46..0458cd51ed48 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+#include <linux/types.h>
+
struct posix_acl;
struct inode;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 361a866c1995..f3bffe08b290 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -18,7 +18,7 @@ enum {
};
#define NO_THRESHOLD (-1)
-#define DFT_THRESHOLD (32)
+#define DEFAULT_THRESHOLD (32)
struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
@@ -94,9 +94,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
ret->limit_active = limit_active;
if (thresh == 0)
- thresh = DFT_THRESHOLD;
+ thresh = DEFAULT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
- if (thresh < DFT_THRESHOLD) {
+ if (thresh < DEFAULT_THRESHOLD) {
ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
@@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
- int need_change = 0;
+ bool need_change = false;
if (wq->thresh == NO_THRESHOLD)
return;
@@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
new_current_active--;
new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
if (new_current_active != wq->current_active) {
- need_change = 1;
+ need_change = true;
wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
- if (need_change) {
+ if (need_change)
workqueue_set_max_active(wq->normal_wq, wq->current_active);
- }
}
static void run_ordered_work(struct btrfs_workqueue *wq,
@@ -296,7 +295,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct btrfs_workqueue *wq = work->wq;
- int need_order = 0;
+ bool need_order = false;
/*
* We should not touch things inside work in the following cases:
@@ -307,7 +306,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
* So we save the needed things here.
*/
if (work->ordered_func)
- need_order = 1;
+ need_order = true;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04f53ca548e1..5936cff80ff3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,6 +250,21 @@ static int prelim_ref_compare(const struct prelim_ref *ref1,
return 0;
}
+static int prelim_ref_rb_add_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct prelim_ref *ref_new =
+ rb_entry(new, struct prelim_ref, rbnode);
+ const struct prelim_ref *ref_exist =
+ rb_entry(exist, struct prelim_ref, rbnode);
+
+ /*
+ * prelim_ref_compare() expects the first parameter as the existing one,
+ * different from the rb_find_add_cached() order.
+ */
+ return prelim_ref_compare(ref_exist, ref_new);
+}
+
static void update_share_count(struct share_check *sc, int oldcount,
int newcount, const struct prelim_ref *newref)
{
@@ -278,55 +293,39 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct share_check *sc)
{
struct rb_root_cached *root;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct prelim_ref *ref;
- int result;
- bool leftmost = true;
+ struct rb_node *exist;
root = &preftree->root;
- p = &root->rb_root.rb_node;
+ exist = rb_find_add_cached(&newref->rbnode, root, prelim_ref_rb_add_cmp);
+ if (exist) {
+ struct prelim_ref *ref = rb_entry(exist, struct prelim_ref, rbnode);
+ /* Identical refs, merge them and free @newref */
+ struct extent_inode_elem *eie = ref->inode_list;
- while (*p) {
- parent = *p;
- ref = rb_entry(parent, struct prelim_ref, rbnode);
- result = prelim_ref_compare(ref, newref);
- if (result < 0) {
- p = &(*p)->rb_left;
- } else if (result > 0) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- /* Identical refs, merge them and free @newref */
- struct extent_inode_elem *eie = ref->inode_list;
+ while (eie && eie->next)
+ eie = eie->next;
- while (eie && eie->next)
- eie = eie->next;
-
- if (!eie)
- ref->inode_list = newref->inode_list;
- else
- eie->next = newref->inode_list;
- trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
- preftree->count);
- /*
- * A delayed ref can have newref->count < 0.
- * The ref->count is updated to follow any
- * BTRFS_[ADD|DROP]_DELAYED_REF actions.
- */
- update_share_count(sc, ref->count,
- ref->count + newref->count, newref);
- ref->count += newref->count;
- free_pref(newref);
- return;
- }
+ if (!eie)
+ ref->inode_list = newref->inode_list;
+ else
+ eie->next = newref->inode_list;
+ trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
+ preftree->count);
+ /*
+ * A delayed ref can have newref->count < 0.
+ * The ref->count is updated to follow any
+ * BTRFS_[ADD|DROP]_DELAYED_REF actions.
+ */
+ update_share_count(sc, ref->count,
+ ref->count + newref->count, newref);
+ ref->count += newref->count;
+ free_pref(newref);
+ return;
}
update_share_count(sc, 0, newref->count, newref);
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
- rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -1400,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
ASSERT(ctx->roots == NULL);
key.objectid = ctx->bytenr;
- key.offset = (u64)-1;
if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
path = btrfs_alloc_path();
if (!path)
@@ -2207,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_extent_item *ei;
struct btrfs_key key;
+ key.objectid = logical;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -3022,9 +3021,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
- INIT_LIST_HEAD(&cache->changed);
- INIT_LIST_HEAD(&cache->detached);
- INIT_LIST_HEAD(&cache->leaves);
INIT_LIST_HEAD(&cache->pending_edge);
INIT_LIST_HEAD(&cache->useless_node);
cache->fs_info = fs_info;
@@ -3132,29 +3128,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
- struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
if (!node)
return;
- BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next, struct btrfs_backref_edge,
list[LOWER]);
- upper = edge->node[UPPER];
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
-
- /*
- * Add the node to leaf node list if no other child block
- * cached.
- */
- if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower, &cache->leaves);
- upper->lowest = 1;
- }
}
btrfs_backref_drop_node(cache, node);
@@ -3166,33 +3150,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
struct btrfs_backref_node *node;
- int i;
- while (!list_empty(&cache->detached)) {
- node = list_entry(cache->detached.next,
- struct btrfs_backref_node, list);
+ while ((node = rb_entry_safe(rb_first(&cache->rb_root),
+ struct btrfs_backref_node, rb_node)))
btrfs_backref_cleanup_node(cache, node);
- }
- while (!list_empty(&cache->leaves)) {
- node = list_entry(cache->leaves.next,
- struct btrfs_backref_node, lower);
- btrfs_backref_cleanup_node(cache, node);
- }
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- while (!list_empty(&cache->pending[i])) {
- node = list_first_entry(&cache->pending[i],
- struct btrfs_backref_node,
- list);
- btrfs_backref_cleanup_node(cache, node);
- }
- }
ASSERT(list_empty(&cache->pending_edge));
ASSERT(list_empty(&cache->useless_node));
- ASSERT(list_empty(&cache->changed));
- ASSERT(list_empty(&cache->detached));
- ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
ASSERT(!cache->nr_nodes);
ASSERT(!cache->nr_edges);
}
@@ -3316,8 +3280,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
if (IS_ERR(root))
return PTR_ERR(root);
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- cur->cowonly = 1;
+
+ /* We shouldn't be using backref cache for non-shareable roots. */
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
if (btrfs_root_level(&root->root_item) == cur->level) {
/* Tree root */
@@ -3403,8 +3371,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
goto out;
}
upper->owner = btrfs_header_owner(eb);
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- upper->cowonly = 1;
+
+ /* We shouldn't be using backref cache for non shareable roots. */
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_put_root(root);
+ btrfs_backref_free_edge(cache, edge);
+ btrfs_backref_free_node(cache, upper);
+ ret = -EUCLEAN;
+ goto out;
+ }
/*
* If we know the block isn't shared we can avoid
@@ -3595,15 +3570,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
- /* Insert this node to cache if it's not COW-only */
- if (!start->cowonly) {
- rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
- &start->rb_node);
- if (rb_node)
- btrfs_backref_panic(cache->fs_info, start->bytenr,
- -EEXIST);
- list_add_tail(&start->lower, &cache->leaves);
- }
+ rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
+ if (rb_node)
+ btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
/*
* Use breadth first search to iterate all related edges.
@@ -3642,11 +3611,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
* parents have already been linked.
*/
if (!RB_EMPTY_NODE(&upper->rb_node)) {
- if (upper->lowest) {
- list_del_init(&upper->lower);
- upper->lowest = 0;
- }
-
list_add_tail(&edge->list[UPPER], &upper->lower);
continue;
}
@@ -3657,23 +3621,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
return -EUCLEAN;
}
- /* Sanity check, COW-only node has non-COW-only parent */
- if (start->cowonly != upper->cowonly) {
- ASSERT(0);
+ rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ if (unlikely(rb_node)) {
+ btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
}
- /* Only cache non-COW-only (subvolume trees) tree blocks */
- if (!upper->cowonly) {
- rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- if (rb_node) {
- btrfs_backref_panic(cache->fs_info,
- upper->bytenr, -EEXIST);
- return -EUCLEAN;
- }
- }
-
list_add_tail(&edge->list[UPPER], &upper->lower);
/*
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index e8c22cccb5c1..74e614031274 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -318,6 +318,12 @@ struct btrfs_backref_node {
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
+ /*
+ * This is a sanity check, whenever we COW a block we will update
+ * new_bytenr with it's current location, and we will check this in
+ * various places to validate that the cache makes sense, it shouldn't
+ * be used for anything else.
+ */
u64 new_bytenr;
/* Objectid of tree block owner, can be not uptodate */
u64 owner;
@@ -335,10 +341,6 @@ struct btrfs_backref_node {
struct extent_buffer *eb;
/* Level of the tree block */
unsigned int level:8;
- /* Is the block in a non-shareable tree */
- unsigned int cowonly:1;
- /* 1 if no child node is in the cache */
- unsigned int lowest:1;
/* Is the extent buffer locked */
unsigned int locked:1;
/* Has the block been processed */
@@ -391,12 +393,6 @@ struct btrfs_backref_cache {
* level blocks may not reflect the new location
*/
struct list_head pending[BTRFS_MAX_LEVEL];
- /* List of backref nodes with no child node */
- struct list_head leaves;
- /* List of blocks that have been COWed in current transaction */
- struct list_head changed;
- /* List of detached backref node. */
- struct list_head detached;
u64 last_trans;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 7ea6f0b43b95..8c2eee1f1878 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
return bbio;
}
-/* Free a bio that was never submitted to the underlying device. */
-static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio))
- btrfs_put_ordered_extent(bbio->ordered);
- bio_put(&bbio->bio);
-}
-
-static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio)) {
- struct btrfs_ordered_extent *ordered = bbio->ordered;
-
- bbio->end_io(bbio);
- btrfs_put_ordered_extent(ordered);
- } else {
- bbio->end_io(bbio);
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- btrfs_cleanup_bio(bbio);
+ /* Free bio that was never submitted to the underlying device. */
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+
bbio = orig_bbio;
}
@@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
/* Load split bio's error which might be set above. */
if (status == BLK_STS_OK)
bbio->bio.bi_status = READ_ONCE(bbio->status);
- __btrfs_bio_end_io(bbio);
+
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
}
}
@@ -453,6 +445,14 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
+ /*
+ * Track reads if tracking is enabled; ignore I/O operations before the
+ * filesystem is fully initialized.
+ */
+ if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
+ percpu_counter_add(&dev->fs_info->stats_read_blocks,
+ bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
+
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -573,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
+ btrfs_bio_end_io(async->bbio, bio->bi_status);
return;
}
@@ -725,8 +725,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
- if (is_data_bbio(bbio) && bioc &&
- btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
/*
* No locking for the list update, as we only add to
* the list in the I/O submission path, and list
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 4427c1b835e8..a8129f1ce78c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -173,43 +173,41 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
}
}
+static int btrfs_bg_start_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_block_group *new_bg =
+ rb_entry(new, struct btrfs_block_group, cache_node);
+ const struct btrfs_block_group *exist_bg =
+ rb_entry(exist, struct btrfs_block_group, cache_node);
+
+ if (new_bg->start < exist_bg->start)
+ return -1;
+ if (new_bg->start > exist_bg->start)
+ return 1;
+ return 0;
+}
+
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group *block_group)
+static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_block_group *cache;
- bool leftmost = true;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct rb_node *exist;
+ int ret = 0;
ASSERT(block_group->length != 0);
- write_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_root.rb_node;
-
- while (*p) {
- parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group, cache_node);
- if (block_group->start < cache->start) {
- p = &(*p)->rb_left;
- } else if (block_group->start > cache->start) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- write_unlock(&info->block_group_cache_lock);
- return -EEXIST;
- }
- }
-
- rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color_cached(&block_group->cache_node,
- &info->block_group_cache_tree, leftmost);
+ write_lock(&fs_info->block_group_cache_lock);
- write_unlock(&info->block_group_cache_lock);
+ exist = rb_find_add_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
+ if (exist)
+ ret = -EEXIST;
+ write_unlock(&fs_info->block_group_cache_lock);
- return 0;
+ return ret;
}
/*
@@ -586,7 +584,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
struct btrfs_root *extent_root;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret = 0;
@@ -628,7 +626,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
- btrfs_free_path(path);
return ret;
}
@@ -740,8 +737,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
path->reada = READA_FORWARD;
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -787,8 +784,8 @@ next:
if (key.objectid < last) {
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
btrfs_release_path(path);
goto next;
}
@@ -1223,7 +1220,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
- btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
+ btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
-block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
@@ -1396,8 +1393,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
- btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
- -cache->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
@@ -1460,6 +1456,32 @@ out:
}
/*
+ * Link the block_group to a list via bg_list.
+ *
+ * @bg: The block_group to link to the list.
+ * @list: The list to link it to.
+ *
+ * Use this rather than list_add_tail() directly to ensure proper respect
+ * to locking and refcounting.
+ *
+ * Returns: true if the bg was linked with a refcount bump and false otherwise.
+ */
+static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ bool added = false;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, list);
+ added = true;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return added;
+}
+
+/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
@@ -1574,8 +1596,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* drop under the "next" label for the
* fs_info->unused_bgs list.
*/
- btrfs_get_block_group(block_group);
- list_add_tail(&block_group->bg_list, &retry_list);
+ btrfs_link_bg_list(block_group, &retry_list);
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1645,8 +1666,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- btrfs_space_info_update_bytes_pinned(fs_info, space_info,
- -block_group->pinned);
+ btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
block_group->pinned = 0;
@@ -1827,7 +1847,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
- u64 reclaimed;
+ u64 used;
+ u64 reserved;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1891,6 +1912,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
+ /*
+ * Cache the zone_unusable value before turning the block group
+ * to read only. As soon as the block group is read only it's
+ * zone_unusable value gets moved to the block group's read-only
+ * bytes and isn't available for calculations anymore. We also
+ * cache it before unlocking the block group, to prevent races
+ * (reports from KCSAN and such tools) with tasks updating it.
+ */
+ zone_unusable = bg->zone_unusable;
+
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
@@ -1907,31 +1939,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * happens when running delayed references. When we relocate the
+ * chunk below, relocation first flushes dellaloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+ "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable",
bg->start,
- div64_u64(bg->used * 100, bg->length),
+ div64_u64(used * 100, bg->length),
+ div64_u64(reserved * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
- reclaimed = bg->used;
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
- reclaimed = 0;
+ used = 0;
+ reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -1940,24 +1988,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&space_info->lock);
space_info->reclaim_count++;
- space_info->reclaim_bytes += reclaimed;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
spin_unlock(&space_info->lock);
next:
- if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
- /* Refcount held by the reclaim_bgs list after splice. */
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * This block group might be added to the unused list
- * during the above process. Move it back to the
- * reclaim list otherwise.
- */
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
- }
+ if (ret && !READ_ONCE(space_info->periodic_reclaim))
+ btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1997,13 +2034,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
+ if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
trace_btrfs_add_reclaim_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
@@ -2414,7 +2446,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
- ret = btrfs_add_block_group_cache(info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
@@ -2463,7 +2495,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = map->chunk_len;
bg->flags = map->type;
- ret = btrfs_add_block_group_cache(fs_info, bg);
+ ret = btrfs_add_block_group_cache(bg);
/*
* We may have some valid block group cache added already, in
* that case we skip to the next one.
@@ -2513,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
return fill_dummy_bgs(info);
key.objectid = 0;
- key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2645,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2662,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
key.offset = start;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
@@ -2670,11 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2776,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
/*
* If the block group is still unused, add it to the list of
@@ -2893,7 +2926,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
ASSERT(cache->space_info);
- ret = btrfs_add_block_group_cache(fs_info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
@@ -2915,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
}
#endif
- list_add_tail(&cache->bg_list, &trans->new_bgs);
+ btrfs_link_bg_list(cache, &trans->new_bgs);
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
@@ -3060,8 +3093,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
(cache->alloc_offset - cache->used - cache->pinned -
cache->reserved) +
(cache->length - cache->zone_capacity);
- btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
- cache->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
@@ -3123,7 +3155,6 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(trans, leaf);
fail:
btrfs_release_path(path);
/*
@@ -3313,7 +3344,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
@@ -3330,7 +3361,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
}
- btrfs_free_path(path);
return 0;
}
@@ -3353,7 +3383,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int loops = 0;
@@ -3508,7 +3538,6 @@ out:
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
- btrfs_free_path(path);
return ret;
}
@@ -3519,7 +3548,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct list_head *io = &cur_trans->io_bgs;
path = btrfs_alloc_path();
@@ -3631,7 +3660,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
btrfs_put_block_group(cache);
}
- btrfs_free_path(path);
return ret;
}
@@ -3699,7 +3727,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val -= num_bytes;
cache->used = old_val;
cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
space_info->bytes_used -= num_bytes;
space_info->disk_used -= num_bytes * factor;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -3781,8 +3809,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info->bytes_reserved += num_bytes;
trace_btrfs_space_reservation(cache->fs_info, "space_info",
space_info->flags, num_bytes, 1);
- btrfs_space_info_update_bytes_may_use(cache->fs_info,
- space_info, -ram_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index a07b9594dc70..3f3608299c0b 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
spin_unlock(&dest->lock);
}
if (num_bytes)
- btrfs_space_info_free_bytes_may_use(fs_info,
- space_info,
- num_bytes);
+ btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
}
if (qgroup_to_release_ret)
*qgroup_to_release_ret = qgroup_to_release;
@@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
- num_bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
- -num_bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
block_rsv->reserved = block_rsv->size;
btrfs_try_granting_tickets(fs_info, sinfo);
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index aa1f55cd81b7..4e2952cf5766 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -145,6 +145,7 @@ struct btrfs_inode {
* different from prop_compress and takes precedence if set.
*/
u8 defrag_compress;
+ s8 defrag_compress_level;
/*
* Lock for counters and all fields used to determine if the inode is in
@@ -516,6 +517,14 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
+static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ mapping_clear_stable_writes(inode->vfs_inode.i_mapping);
+ else
+ mapping_set_stable_writes(inode->vfs_inode.i_mapping);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
@@ -524,9 +533,9 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
- bool nowait, bool strict);
+ bool nowait);
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
@@ -584,9 +593,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path);
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0c4d486c3048..e7f8ee5d48a4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, unsigned int level)
+static struct list_head *alloc_workspace(int type, int level)
{
switch (type) {
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
@@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, unsigned int level)
+struct list_head *btrfs_get_workspace(int type, int level)
{
struct workspace_manager *wsm;
struct list_head *workspace;
@@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws)
* Adjust @level according to the limits of the compression algorithm or
* fallback to default
*/
-static unsigned int btrfs_compress_set_level(int type, unsigned level)
+static int btrfs_compress_set_level(unsigned int type, int level)
{
const struct btrfs_compress_op *ops = btrfs_compress_op[type];
if (level == 0)
level = ops->default_level;
else
- level = min(level, ops->max_level);
+ level = min(max(level, ops->min_level), ops->max_level);
return level;
}
+/*
+ * Check whether the @level is within the valid range for the given type.
+ */
+bool btrfs_compress_level_valid(unsigned int type, int level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ return ops->min_level <= level && level <= ops->max_level;
+}
+
/* Wrapper around find_get_page(), with extra error message. */
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret)
@@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
- int type = btrfs_compress_type(type_level);
- int level = btrfs_compress_level(type_level);
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1590,18 +1598,19 @@ out:
/*
* Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level
+ * level, unrecognized string will set the default level. Negative level
+ * numbers are allowed.
*/
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str)
{
- unsigned int level = 0;
+ int level = 0;
int ret;
if (!type)
return 0;
if (str[0] == ':') {
- ret = kstrtouint(str + 1, 10, &level);
+ ret = kstrtoint(str + 1, 10, &level);
if (ret)
level = 0;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 954034086d0d..df198623cc08 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -72,16 +72,6 @@ struct compressed_bio {
struct btrfs_bio bbio;
};
-static inline unsigned int btrfs_compress_type(unsigned int type_level)
-{
- return (type_level & 0xF);
-}
-
-static inline unsigned int btrfs_compress_level(unsigned int type_level)
-{
- return ((type_level & 0xF0) >> 4);
-}
-
/* @range_end must be exclusive. */
static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
{
@@ -93,7 +83,8 @@ static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+bool btrfs_compress_level_valid(unsigned int type, int level);
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -107,7 +98,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str);
struct folio *btrfs_alloc_compr_folio(void);
void btrfs_free_compr_folio(struct folio *folio);
@@ -131,14 +122,15 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, unsigned int level);
+struct list_head *btrfs_get_workspace(int type, int level);
void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
struct workspace_manager *workspace_manager;
/* Maximum level supported by the compression algorithm */
- unsigned int max_level;
- unsigned int default_level;
+ int min_level;
+ int max_level;
+ int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
@@ -187,9 +179,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+struct list_head *zstd_alloc_workspace(int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
+struct list_head *zstd_get_workspace(int level);
void zstd_put_workspace(struct list_head *ws);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 693dc27ffb89..a2e7979372cc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
-
-static const struct btrfs_csums {
- u16 size;
- const char name[10];
- const char driver[12];
-} btrfs_csums[] = {
- [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
- [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
- [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
- [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
- .driver = "blake2b-256" },
-};
-
/*
* The leaf data grows from end-to-front in the node. this returns the address
* of the start of the last item, which is the stop of the leaf data stack.
@@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,
nr_items * sizeof(struct btrfs_item));
}
-/* This exists for btrfs-progs usages. */
-u16 btrfs_csum_type_size(u16 type)
-{
- return btrfs_csums[type].size;
-}
-
-int btrfs_super_csum_size(const struct btrfs_super_block *s)
-{
- u16 t = btrfs_super_csum_type(s);
- /*
- * csum type is validated at mount time
- */
- return btrfs_csum_type_size(t);
-}
-
-const char *btrfs_super_csum_name(u16 csum_type)
-{
- /* csum type is validated at mount time */
- return btrfs_csums[csum_type].name;
-}
-
-/*
- * Return driver name if defined, otherwise the name that's also a valid driver
- * name
- */
-const char *btrfs_super_csum_driver(u16 csum_type)
-{
- /* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver[0] ?
- btrfs_csums[csum_type].driver :
- btrfs_csums[csum_type].name;
-}
-
-size_t __attribute_const__ btrfs_get_num_csums(void)
-{
- return ARRAY_SIZE(btrfs_csums);
-}
-
struct btrfs_path *btrfs_alloc_path(void)
{
might_sleep();
@@ -226,22 +175,6 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
/*
- * We want the transaction abort to print stack trace only for errors where the
- * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
- * caused by external factors.
- */
-bool __cold abort_should_print_stack(int error)
-{
- switch (error) {
- case -EIO:
- case -EROFS:
- case -ENOMEM:
- return false;
- }
- return true;
-}
-
-/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
@@ -654,6 +587,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
goto error_unlock_cow;
}
}
+
+ trace_btrfs_cow_block(root, buf, cow);
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
@@ -710,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
- int ret;
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
btrfs_abort_transaction(trans, -EUCLEAN);
@@ -751,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
- cow_ret, search_start, 0, nest);
-
- trace_btrfs_cow_block(root, buf, *cow_ret);
-
- return ret;
+ return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
@@ -1566,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
btrfs_unlock_up_safe(p, parent_level + 1);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1609,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
ASSERT(ret == -EAGAIN);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -3903,6 +3835,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+ key.type != BTRFS_RAID_STRIPE_KEY &&
key.type != BTRFS_EXTENT_CSUM_KEY);
if (btrfs_leaf_free_space(leaf) >= ins_len)
@@ -4373,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 data_size)
{
int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
unsigned long ptr;
@@ -4387,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
write_extent_buffer(leaf, data, ptr, data_size);
btrfs_mark_buffer_dirty(trans, leaf);
}
- btrfs_free_path(path);
return ret;
}
@@ -4675,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u64 min_trans)
{
struct extent_buffer *cur;
- struct btrfs_key found_key;
int slot;
int sret;
u32 nritems;
@@ -4711,7 +4642,8 @@ again:
goto find_next_key;
ret = 0;
path->slots[level] = slot;
- btrfs_item_key_to_cpu(cur, &found_key, slot);
+ /* Save our key for returning back. */
+ btrfs_item_key_to_cpu(cur, min_key, slot);
goto out;
}
if (sret && slot > 0)
@@ -4735,8 +4667,8 @@ find_next_key:
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
+ path->slots[level] = slot;
if (slot >= nritems) {
- path->slots[level] = slot;
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -4746,11 +4678,10 @@ find_next_key:
goto out;
}
}
- /* save our key for returning back */
- btrfs_node_key_to_cpu(cur, &found_key, slot);
- path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
+ /* Save our key for returning back. */
+ btrfs_node_key_to_cpu(cur, min_key, slot);
goto out;
}
cur = btrfs_read_node_slot(cur, slot);
@@ -4767,10 +4698,8 @@ find_next_key:
}
out:
path->keep_locks = keep_locks;
- if (ret == 0) {
+ if (ret == 0)
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- memcpy(min_key, &found_key, sizeof(found_key));
- }
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c341956a01c..075a06db43a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,8 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include "linux/cleanup.h"
-#include <linux/pagemap.h>
+#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
@@ -506,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
-#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
- ((bytes) >> (fs_info)->sectorsize_bits)
-
-static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
-{
- return mapping_gfp_constraint(mapping, ~__GFP_FS);
-}
-
-void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
-int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
-int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
-
-/* ctree.c */
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
@@ -756,18 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
-u16 btrfs_csum_type_size(u16 type);
-int btrfs_super_csum_size(const struct btrfs_super_block *s);
-const char *btrfs_super_csum_name(u16 csum_type);
-const char *btrfs_super_csum_driver(u16 csum_type);
-size_t __attribute_const__ btrfs_get_num_csums(void);
-
-/*
- * We use folio flag owner_2 to indicate there is an ordered extent with
- * unfinished IO.
- */
-#define folio_test_ordered(folio) folio_test_owner_2(folio)
-#define folio_set_ordered(folio) folio_set_owner_2(folio)
-#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
-
#endif
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..d4310d93f532 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -225,7 +225,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int ret = 0;
u64 cur = 0;
@@ -250,24 +250,24 @@ again:
goto cleanup;
}
- if (cur >= i_size_read(inode)) {
- iput(inode);
+ if (cur >= i_size_read(&inode->vfs_inode)) {
+ iput(&inode->vfs_inode);
goto cleanup;
}
/* Do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(ra, inode->vfs_inode.i_mapping);
sb_start_write(fs_info->sb);
ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
+ BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret < 0)
goto cleanup;
@@ -1352,17 +1352,18 @@ out:
* (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
* defragging all the range).
*/
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int compress_level = 0;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
@@ -1376,10 +1377,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
return -EINVAL;
if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
+ if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress.type) {
+ compress_type = range->compress.type;
+ compress_level = range->compress.level;
+ if (!btrfs_compress_level_valid(compress_type, compress_level))
+ return -EINVAL;
+ }
+ } else {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
}
if (extent_thresh == 0)
@@ -1402,8 +1414,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* defrag range can be written sequentially.
*/
start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
+ if (start_index < inode->vfs_inode.i_mapping->writeback_index)
+ inode->vfs_inode.i_mapping->writeback_index = start_index;
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
@@ -1420,27 +1432,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
cluster_end = min(cluster_end, last_byte);
- btrfs_inode_lock(BTRFS_I(inode), 0);
- if (IS_SWAPFILE(inode)) {
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(&inode->vfs_inode)) {
ret = -ETXTBSY;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ if (do_compress) {
+ inode->defrag_compress = compress_type;
+ inode->defrag_compress_level = compress_level;
+ }
+ ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
newer_than, do_compress, &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
cur = max(cluster_end + 1, last_scanned);
@@ -1462,10 +1476,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* need to be written back immediately.
*/
if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
+ filemap_flush(inode->vfs_inode.i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
+ &inode->runtime_flags))
+ filemap_flush(inode->vfs_inode.i_mapping);
}
if (range->compress_type == BTRFS_COMPRESS_LZO)
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
@@ -1474,9 +1488,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
ret = sectors_defragged;
}
if (do_compress) {
- btrfs_inode_lock(BTRFS_I(inode), 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_lock(inode, 0);
+ inode->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
}
return ret;
}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 6b7596c4f0dc..a7f917a38dbf 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -6,14 +6,14 @@
#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct file_ra_state;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_ioctl_defrag_range_args;
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 7aa8a395d838..88e900e5a43d 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
+ btrfs_space_info_free_bytes_may_use(data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 508bdbae29a0..3f1551d8a5c6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -366,40 +366,35 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
return NULL;
}
+static int btrfs_delayed_item_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_delayed_item *new_item =
+ rb_entry(new, struct btrfs_delayed_item, rb_node);
+ const struct btrfs_delayed_item *exist_item =
+ rb_entry(exist, struct btrfs_delayed_item, rb_node);
+
+ if (new_item->index < exist_item->index)
+ return -1;
+ if (new_item->index > exist_item->index)
+ return 1;
+ return 0;
+}
+
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
struct btrfs_delayed_item *ins)
{
- struct rb_node **p, *node;
- struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
- struct btrfs_delayed_item *item;
- bool leftmost = true;
+ struct rb_node *exist;
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
else
root = &delayed_node->del_root;
- p = &root->rb_root.rb_node;
- node = &ins->rb_node;
-
- while (*p) {
- parent_node = *p;
- item = rb_entry(parent_node, struct btrfs_delayed_item,
- rb_node);
-
- if (item->index < ins->index) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else if (item->index > ins->index) {
- p = &(*p)->rb_left;
- } else {
- return -EEXIST;
- }
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
+ exist = rb_find_add_cached(&ins->rb_node, root, btrfs_delayed_item_cmp);
+ if (exist)
+ return -EEXIST;
if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
ins->index >= delayed_node->index_cnt)
@@ -1038,7 +1033,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
- btrfs_mark_buffer_dirty(trans, leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
@@ -1217,7 +1211,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1244,7 +1238,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
btrfs_release_delayed_node(delayed_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
@@ -1561,8 +1554,7 @@ release_node:
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_node *node,
+static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
u64 index)
{
struct btrfs_delayed_item *item;
@@ -1620,7 +1612,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (IS_ERR(node))
return PTR_ERR(node);
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
+ ret = btrfs_delete_delayed_insertion_item(node, index);
if (!ret)
goto end;
@@ -1824,53 +1816,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
+ struct inode *vfs_inode = &inode->vfs_inode;
u64 flags;
- btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
- btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
- btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
- btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
- btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
- btrfs_set_stack_inode_generation(inode_item,
- BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
+ btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
+ btrfs_set_stack_inode_generation(inode_item, inode->generation);
btrfs_set_stack_inode_sequence(inode_item,
- inode_peek_iversion(inode));
+ inode_peek_iversion(vfs_inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
- btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
- BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev);
+ flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags);
btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode_get_atime_sec(inode));
+ inode_get_atime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode_get_atime_nsec(inode));
+ inode_get_atime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode_get_mtime_sec(inode));
+ inode_get_mtime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode_get_mtime_nsec(inode));
+ inode_get_mtime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime_sec(inode));
+ inode_get_ctime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime_nsec(inode));
+ inode_get_ctime_nsec(vfs_inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec);
}
-int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
+ struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return -ENOENT;
@@ -1883,39 +1875,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode_item = &delayed_node->inode_item;
- i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
- i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_mode = btrfs_stack_inode_mode(inode_item);
- set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
- inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
- BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
-
- inode_set_iversion_queried(inode,
- btrfs_stack_inode_sequence(inode_item));
- inode->i_rdev = 0;
+ i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+ vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
+ inode->generation = btrfs_stack_inode_generation(inode_item);
+ inode->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item));
+ vfs_inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
- inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime),
btrfs_stack_timespec_nsec(&inode_item->atime));
- inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime),
btrfs_stack_timespec_nsec(&inode_item->mtime));
- inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
+ inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
- inode->i_generation = BTRFS_I(inode)->generation;
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ vfs_inode->i_generation = inode->generation;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -1935,8 +1926,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item,
- &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
goto release_node;
}
@@ -1944,7 +1934,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f4d9feac0d0e..c4b4ba122beb 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
-int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
/* Used for drop dead root */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 0d878dbbabba..98c5b61dabe8 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
u64 num_bytes;
u64 reserved_bytes;
+ if (btrfs_is_testing(fs_info))
+ return;
+
num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
trans->delayed_ref_csum_deletions);
@@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
- btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
+ btrfs_space_info_free_bytes_may_use(space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@@ -265,8 +268,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
/*
* compare two delayed data backrefs with same bytenr and type
*/
-static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
- struct btrfs_delayed_ref_node *ref2)
+static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1,
+ const struct btrfs_delayed_ref_node *ref2)
{
if (ref1->data_ref.objectid < ref2->data_ref.objectid)
return -1;
@@ -279,8 +282,8 @@ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
-static int comp_refs(struct btrfs_delayed_ref_node *ref1,
- struct btrfs_delayed_ref_node *ref2,
+static int comp_refs(const struct btrfs_delayed_ref_node *ref1,
+ const struct btrfs_delayed_ref_node *ref2,
bool check_seq)
{
int ret = 0;
@@ -314,34 +317,25 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
+static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist)
+{
+ const struct btrfs_delayed_ref_node *new_node =
+ rb_entry(new, struct btrfs_delayed_ref_node, ref_node);
+ const struct btrfs_delayed_ref_node *exist_node =
+ rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);
+
+ return comp_refs(new_node, exist_node, true);
+}
+
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- bool leftmost = true;
-
- while (*p) {
- int comp;
-
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- ref_node);
- comp = comp_refs(ins, entry, true);
- if (comp < 0) {
- p = &(*p)->rb_left;
- } else if (comp > 0) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- return entry;
- }
- }
+ struct rb_node *exist;
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
+ exist = rb_find_add_cached(node, root, cmp_refs_node);
+ if (exist)
+ return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);
return NULL;
}
@@ -555,6 +549,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
delayed_refs->num_heads_ready--;
}
+struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+
+ lockdep_assert_held(&head->mutex);
+ lockdep_assert_held(&head->lock);
+
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
+ return NULL;
+
+ /*
+ * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * This is to prevent a ref count from going down to zero, which deletes
+ * the extent item from the extent tree, when there still are references
+ * to add, which would fail because they would not find the extent item.
+ */
+ if (!list_empty(&head->ref_add_list))
+ return list_first_entry(&head->ref_add_list,
+ struct btrfs_delayed_ref_node, add_list);
+
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
+ struct btrfs_delayed_ref_node, ref_node);
+ ASSERT(list_empty(&ref->add_list));
+ return ref;
+}
+
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
@@ -1234,6 +1254,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
{
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool testing = btrfs_is_testing(fs_info);
spin_lock(&delayed_refs->lock);
while (true) {
@@ -1263,7 +1284,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (pin_bytes) {
+ if (!testing && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@@ -1281,8 +1302,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_lock(&bg->space_info->lock);
spin_lock(&bg->lock);
bg->pinned += head->num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info,
- bg->space_info,
+ btrfs_space_info_update_bytes_pinned(bg->space_info,
head->num_bytes);
bg->reserved -= head->num_bytes;
bg->space_info->bytes_reserved -= head->num_bytes;
@@ -1295,12 +1315,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ if (!testing)
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
- btrfs_qgroup_destroy_extent_records(trans);
+
+ if (!testing)
+ btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 611fb3388f82..f5ae880308d3 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -14,6 +14,8 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+#include "messages.h"
struct btrfs_trans_handle;
struct btrfs_fs_info;
@@ -402,6 +404,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
+struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ac8e97ed13f7..53d7d85cb4be 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct extent_buffer *eb;
int slot;
int ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
@@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
return 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found:
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
@@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
- goto out;
+ return 0;
}
slot = path->slots[0];
eb = path->nodes[0];
@@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
@@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
key.offset = 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
- goto out;
+ return ret;
}
}
@@ -441,11 +434,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
- btrfs_mark_buffer_dirty(trans, eb);
-
-out:
- btrfs_free_path(path);
-
return ret;
}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1ea5d8fcfbf7..b29cc31a7c4a 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -92,7 +92,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
return ret;
}
@@ -152,7 +151,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
- btrfs_mark_buffer_dirty(trans, leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
@@ -238,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int data_size;
struct extent_buffer *leaf;
int slot;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
@@ -253,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
if (ret < 0)
- goto out;
+ return ret;
}
/* we found an item, look for our name in the item */
if (di) {
/* our exact name was found */
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
/* See if there is room in the item to insert this name. */
@@ -275,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
- ret = -EOVERFLOW;
- } else {
- /* plenty of insertion room */
- ret = 0;
+ return -EOVERFLOW;
}
-out:
- btrfs_free_path(path);
- return ret;
+
+ /* Plenty of insertion room. */
+ return 0;
}
/*
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 28d69970bc70..8462579a95f4 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -10,6 +10,7 @@ struct fscrypt_str;
struct btrfs_fs_info;
struct btrfs_key;
struct btrfs_path;
+struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index a7c3e221378d..a374ce7a1813 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -248,8 +248,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
block_start = extent_map_block_start(em) + (start - em->start);
- if (can_nocow_extent(inode, start, &len,
- &file_extent, false, false) == 1) {
+ if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
+ false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -856,6 +856,22 @@ relock:
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
+ /*
+ * We can't control the folios being passed in, applications can write
+ * to them while a direct IO write is in progress. This means the
+ * content might change after we calculated the data checksum.
+ * Therefore we can end up storing a checksum that doesn't match the
+ * persisted data.
+ *
+ * To be extra safe and avoid false data checksum mismatch, if the
+ * inode requires data checksum, just fallback to buffered IO.
+ * For buffered IO we have full control of page cache and can ensure
+ * no one is modifying the content during writeback.
+ */
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
/*
* The iov_iter can be mapped to the same file range we are writing to.
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
index 3dc3ea926afe..df5d45ee6de7 100644
--- a/fs/btrfs/direct-io.h
+++ b/fs/btrfs/direct-io.h
@@ -5,6 +5,8 @@
#include <linux/types.h>
+struct kiocb;
+
int __init btrfs_init_dio(void);
void __cold btrfs_destroy_dio(void);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e815d165cccc..d6eef4bd9e9d 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -260,9 +254,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index dddb0f9101ba..2c5e85394092 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
+#include <linux/types.h>
#include <linux/sizes.h>
struct btrfs_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eff0dd1ae62f..aa58e0663a5d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,13 +182,12 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 start = max_t(u64, eb->start, folio_pos(folio));
u64 end = min_t(u64, eb->start + eb->len,
@@ -226,7 +225,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
- ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
+ ret = read_extent_buffer_pages(eb, mirror_num, check);
if (!ret)
break;
@@ -284,8 +283,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
- eb->start, eb->len)))
+ if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -1089,21 +1087,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key)
{
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
- btrfs_free_path(path);
return root;
}
/*
- * Initialize subvolume root in-memory structure
+ * Initialize subvolume root in-memory structure.
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ *
+ * In case of failure the caller is responsible to call btrfs_free_fs_root()
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
@@ -1127,7 +1126,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ return ret;
} else {
root->anon_dev = anon_dev;
}
@@ -1137,7 +1136,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto fail;
+ return ret;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1145,9 +1144,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
mutex_unlock(&root->objectid_mutex);
return 0;
-fail:
- /* The caller is responsible to call btrfs_free_fs_root */
- return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -1258,6 +1254,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+ percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
@@ -1567,7 +1564,7 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ delay = secs_to_jiffies(fs_info->commit_interval);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1582,9 +1579,9 @@ static int transaction_kthread(void *arg)
cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay -= secs_to_jiffies(delta - 1);
delay = min(delay,
- msecs_to_jiffies(fs_info->commit_interval * 1000));
+ secs_to_jiffies(fs_info->commit_interval));
goto sleep;
}
transid = cur->transid;
@@ -2199,8 +2196,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
static int load_global_roots(struct btrfs_root *tree_root)
{
- struct btrfs_path *path;
- int ret = 0;
+ BTRFS_PATH_AUTO_FREE(path);
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -2209,18 +2206,17 @@ static int load_global_roots(struct btrfs_root *tree_root)
ret = load_global_roots_objectid(tree_root, path,
BTRFS_EXTENT_TREE_OBJECTID, "extent");
if (ret)
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_CSUM_TREE_OBJECTID, "csum");
if (ret)
- goto out;
+ return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_FREE_SPACE_TREE_OBJECTID,
"free space");
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2327,6 +2323,71 @@ out:
return ret;
}
+static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb)
+{
+ unsigned int cur = 0; /* Offset inside the sys chunk array */
+ /*
+ * At sb read time, fs_info is not fully initialized. Thus we have
+ * to use super block sectorsize, which should have been validated.
+ */
+ const u32 sectorsize = btrfs_super_sectorsize(sb);
+ u32 sys_array_size = btrfs_super_sys_array_size(sb);
+
+ if (sys_array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ btrfs_err(fs_info, "system chunk array too big %u > %u",
+ sys_array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ return -EUCLEAN;
+ }
+
+ while (cur < sys_array_size) {
+ struct btrfs_disk_key *disk_key;
+ struct btrfs_chunk *chunk;
+ struct btrfs_key key;
+ u64 type;
+ u16 num_stripes;
+ u32 len;
+ int ret;
+
+ disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur);
+ len = sizeof(*disk_key);
+
+ if (cur + len > sys_array_size)
+ goto short_read;
+ cur += len;
+
+ btrfs_disk_key_to_cpu(&key, disk_key);
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ btrfs_err(fs_info,
+ "unexpected item type %u in sys_array at offset %u",
+ key.type, cur);
+ return -EUCLEAN;
+ }
+ chunk = (struct btrfs_chunk *)(sb->sys_chunk_array + cur);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ if (cur + btrfs_chunk_item_size(num_stripes) > sys_array_size)
+ goto short_read;
+ type = btrfs_stack_chunk_type(chunk);
+ if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ btrfs_err(fs_info,
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur);
+ return -EUCLEAN;
+ }
+ ret = btrfs_check_chunk_valid(fs_info, NULL, chunk, key.offset,
+ sectorsize);
+ if (ret < 0)
+ return ret;
+ cur += btrfs_chunk_item_size(num_stripes);
+ }
+ return 0;
+short_read:
+ btrfs_err(fs_info,
+ "super block sys chunk array short read, cur=%u sys_array_size=%u",
+ cur, sys_array_size);
+ return -EUCLEAN;
+}
+
/*
* Real super block validation
* NOTE: super csum type and incompat features will not be checked here.
@@ -2381,21 +2442,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
/*
- * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
+ *
+ * For 4K page sized systems with non-debug builds, all 3 matches (4K).
+ * For 4K page sized systems with debug builds, there are two block sizes
+ * supported. (4K and 2K)
*
* We can support 16K sectorsize with 64K page size without problem,
* but such sectorsize/pagesize combination doesn't make much sense.
* 4K will be our future standard, PAGE_SIZE is supported from the very
* beginning.
*/
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
+ sectorsize != PAGE_SIZE &&
+ sectorsize != BTRFS_MIN_BLOCKSIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2495,6 +2562,11 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (ret)
+ return ret;
+
+ ret = validate_sys_chunk_array(fs_info, sb);
+
/*
* Obvious sys_chunk_array corruptions, it must hold at least one key
* and one chunk
@@ -2856,6 +2928,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ ret = percpu_counter_init(&fs_info->stats_read_blocks, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -3318,9 +3394,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
+ fs_info->fs_devices->fs_info = fs_info;
/*
* Handle the space caching options appropriately now that we have the
@@ -3343,11 +3419,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE)
- btrfs_warn(fs_info,
- "read-write for sector size %u with page size %lu is experimental",
- sectorsize, PAGE_SIZE);
-
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3782,7 +3853,6 @@ static int write_dev_supers(struct btrfs_device *device,
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3855,7 +3925,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
@@ -4253,6 +4322,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4273,6 +4350,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turns does a call to
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
+
+ /*
* After we parked the cleaner kthread, ordered extents may have
* completed and created new delayed iputs. If one of the async reclaim
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
@@ -4296,6 +4398,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
btrfs_run_delayed_iputs(fs_info);
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
@@ -4330,9 +4434,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4455,10 +4556,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4829,7 +4926,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
@@ -4845,14 +4942,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
@@ -4863,10 +4959,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a7051e2570c1..587842991b24 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -96,9 +96,6 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
/*
* This function is used to grab the root, and avoid it is freed when we
* access it. But it doesn't ensure that the tree is not dropped.
- *
- * If you want to ensure the whole tree is safe, you should use
- * fs_info->subvol_srcu
*/
static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e2b22bea348a..7fc8a3200b40 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
- struct inode *inode;
+ struct btrfs_inode *inode;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation != 0 && generation != inode->i_generation) {
- iput(inode);
+ if (generation != 0 && generation != inode->vfs_inode.i_generation) {
+ iput(&inode->vfs_inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ return d_obtain_alias(&inode->vfs_inode);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = d_inode(child);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *dir = BTRFS_I(d_inode(child));
+ struct btrfs_inode *inode;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_root_ref *ref;
@@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
- key.objectid = btrfs_ino(BTRFS_I(dir));
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(key.objectid, root));
+ inode = btrfs_iget(key.objectid, root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_obtain_alias(&inode->vfs_inode);
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -219,11 +224,11 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = d_inode(child);
- struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(child));
+ struct btrfs_inode *dir = BTRFS_I(d_inode(parent));
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
@@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name,
int ret;
u64 ino;
- if (!S_ISDIR(dir->i_mode))
+ if (!S_ISDIR(dir->vfs_inode.i_mode))
return -EINVAL;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
+ key.objectid = btrfs_root_id(inode->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
key.objectid = ino;
- key.offset = btrfs_ino(BTRFS_I(dir));
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = btrfs_ino(dir);
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
path->slots[0]--;
- } else {
- btrfs_free_path(path);
+ else
return -ENOENT;
- }
}
leaf = path->nodes[0];
@@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
read_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_free_path(path);
/*
* have to add the null termination to make sure that reconnect_path
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 6d08c100b01d..13de6af279e5 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -346,10 +346,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(const struct extent_io_tree *tree,
- const struct extent_state *state,
- const char *opname,
- int err)
+static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
btrfs_panic(extent_io_tree_to_fs_info(tree), err,
"extent io tree error on %s state start %llu end %llu",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3c6f7fecbb9a..957230abd827 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
struct btrfs_root *root = btrfs_extent_root(fs_info, start);
- int ret;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = start;
- key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- btrfs_free_path(path);
- return ret;
+ key.offset = len;
+ return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
/*
@@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 num_refs;
u64 extent_flags;
@@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
search_again:
key.objectid = bytenr;
- key.offset = offset;
if (metadata)
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = offset;
extent_root = btrfs_extent_root(fs_info, bytenr);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out_free;
+ return ret;
if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
@@ -159,7 +156,7 @@ search_again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -170,7 +167,7 @@ search_again:
"unexpected zero reference count for extent item (%llu %u %llu)",
key.objectid, key.type, key.offset);
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
extent_flags = btrfs_extent_flags(leaf, ei);
owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
@@ -216,8 +213,7 @@ search_again:
*flags = extent_flags;
if (owning_root)
*owning_root = owner;
-out_free:
- btrfs_free_path(path);
+
return ret;
}
@@ -570,7 +566,6 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
- btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
fail:
btrfs_release_path(path);
@@ -618,7 +613,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
- btrfs_mark_buffer_dirty(trans, leaf);
}
return ret;
}
@@ -1050,7 +1044,6 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_mark_buffer_dirty(trans, leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1195,7 +1188,6 @@ static noinline_for_stack int update_inline_extent_backref(
item_size -= size;
btrfs_truncate_item(trans, path, item_size, 1);
}
- btrfs_mark_buffer_dirty(trans, leaf);
return 0;
}
@@ -1260,12 +1252,12 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
{
int j, ret = 0;
u64 bytes_left, end;
- u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
+ u64 aligned_start = ALIGN(start, SECTOR_SIZE);
/* Adjust the range to be aligned to 512B sectors if necessary. */
if (start != aligned_start) {
len -= aligned_start - start;
- len = round_down(len, 1 << SECTOR_SHIFT);
+ len = round_down(len, SECTOR_SIZE);
start = aligned_start;
}
@@ -1491,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
@@ -1512,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
- goto out;
+ return ret;
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -1527,7 +1519,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/* now insert the actual backref */
@@ -1538,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (ret)
btrfs_abort_transaction(trans, ret);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -1636,7 +1626,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
u32 item_size;
@@ -1667,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
@@ -1684,8 +1674,8 @@ again:
metadata = 0;
key.objectid = head->bytenr;
- key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = head->num_bytes;
goto again;
}
} else {
@@ -1693,7 +1683,7 @@ again:
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, head->level);
- goto out;
+ return ret;
}
}
@@ -1706,15 +1696,12 @@ again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -1803,30 +1790,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static inline struct btrfs_delayed_ref_node *
-select_delayed_ref(struct btrfs_delayed_ref_head *head)
-{
- struct btrfs_delayed_ref_node *ref;
-
- if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
- return NULL;
-
- /*
- * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * This is to prevent a ref count from going down to zero, which deletes
- * the extent item from the extent tree, when there still are references
- * to add, which would fail because they would not find the extent item.
- */
- if (!list_empty(&head->ref_add_list))
- return list_first_entry(&head->ref_add_list,
- struct btrfs_delayed_ref_node, add_list);
-
- ref = rb_entry(rb_first_cached(&head->ref_tree),
- struct btrfs_delayed_ref_node, ref_node);
- ASSERT(list_empty(&ref->add_list));
- return ref;
-}
-
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@@ -1959,7 +1922,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
lockdep_assert_held(&locked_ref->mutex);
lockdep_assert_held(&locked_ref->lock);
- while ((ref = select_delayed_ref(locked_ref))) {
+ while ((ref = btrfs_select_delayed_ref(locked_ref))) {
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
@@ -2230,10 +2193,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline int check_delayed_ref(struct btrfs_root *root,
+static noinline int check_delayed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 offset, u64 bytenr)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -2307,7 +2271,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* then we have a cross reference.
*/
if (ref->ref_root != btrfs_root_id(root) ||
- ref_owner != objectid || ref_offset != offset) {
+ ref_owner != btrfs_ino(inode) || ref_offset != offset) {
ret = 1;
break;
}
@@ -2318,11 +2282,53 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
return ret;
}
-static noinline int check_committed_ref(struct btrfs_root *root,
+/*
+ * Check if there are references for a data extent other than the one belonging
+ * to the given inode and offset.
+ *
+ * @inode: The only inode we expect to find associated with the data extent.
+ * @path: A path to use for searching the extent tree.
+ * @offset: The only offset we expect to find associated with the data extent.
+ * @bytenr: The logical address of the data extent.
+ *
+ * When the extent does not have any other references other than the one we
+ * expect to find, we always return a value of 0 with the path having a locked
+ * leaf that contains the extent's extent item - this is necessary to ensure
+ * we don't race with a task running delayed references, and our caller must
+ * have such a path when calling check_delayed_ref() - it must lock a delayed
+ * ref head while holding the leaf locked. In case the extent item is not found
+ * in the extent tree, we return -ENOENT with the path having the leaf (locked)
+ * where the extent item should be, in order to prevent races with another task
+ * running delayed references, so that we don't miss any reference when calling
+ * check_delayed_ref().
+ *
+ * Note: this may return false positives, and this is because we want to be
+ * quick here as we're called in write paths (when flushing delalloc and
+ * in the direct IO write path). For example we can have an extent with
+ * a single reference but that reference is not inlined, or we may have
+ * many references in the extent tree but we also have delayed references
+ * that cancel all the reference except the one for our inode and offset,
+ * but it would be expensive to do such checks and complex due to all
+ * locking to avoid races between the checks and flushing delayed refs,
+ * plus non-inline references may be located on leaves other than the one
+ * that contains the extent item in the extent tree. The important thing
+ * here is to not return false negatives and that the false positives are
+ * not very common.
+ *
+ * Returns: 0 if there are no cross references and with the path having a locked
+ * leaf from the extent tree that contains the extent's extent item.
+ *
+ * 1 if there are cross references (false positives can happen).
+ *
+ * < 0 in case of an error. In case of -ENOENT the leaf in the extent
+ * tree where the extent item should be located at is read locked and
+ * accessible in the given path.
+ */
+static noinline int check_committed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr,
- bool strict)
+ u64 offset, u64 bytenr)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
@@ -2336,40 +2342,37 @@ static noinline int check_committed_ref(struct btrfs_root *root,
int ret;
key.objectid = bytenr;
- key.offset = (u64)-1;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = -ENOENT;
if (path->slots[0] == 0)
- goto out;
+ return -ENOENT;
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
- goto out;
+ return -ENOENT;
- ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
/* No inline refs; we need to bail before checking for owner ref. */
if (item_size == sizeof(*ei))
- goto out;
+ return 1;
/* Check for an owner ref; skip over it to the real inline refs. */
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2377,56 +2380,69 @@ static noinline int check_committed_ref(struct btrfs_root *root,
if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
}
/* If extent item has more than 1 inline ref then it's shared */
if (item_size != expected_size)
- goto out;
-
- /*
- * If extent created before last snapshot => it's shared unless the
- * snapshot has been deleted. Use the heuristic if strict is false.
- */
- if (!strict &&
- (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item)))
- goto out;
+ return 1;
/* If this extent has SHARED_DATA_REF then it's shared */
- type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
- goto out;
+ return 1;
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
- btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+ btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
- goto out;
+ return 1;
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr, bool strict, struct btrfs_path *path)
+int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
+ u64 bytenr, struct btrfs_path *path)
{
int ret;
do {
- ret = check_committed_ref(root, path, objectid,
- offset, bytenr, strict);
+ ret = check_committed_ref(inode, path, offset, bytenr);
if (ret && ret != -ENOENT)
goto out;
- ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ /*
+ * The path must have a locked leaf from the extent tree where
+ * the extent item for our extent is located, in case it exists,
+ * or where it should be located in case it doesn't exist yet
+ * because it's new and its delayed ref was not yet flushed.
+ * We need to lock the delayed ref head at check_delayed_ref(),
+ * if one exists, while holding the leaf locked in order to not
+ * race with delayed ref flushing, missing references and
+ * incorrectly reporting that the extent is not shared.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
+ struct extent_buffer *leaf = path->nodes[0];
+
+ ASSERT(leaf != NULL);
+ btrfs_assert_tree_read_locked(leaf);
+
+ if (ret != -ENOENT) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.objectid == bytenr);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
+ }
+ }
+
+ ret = check_delayed_ref(inode, path, offset, bytenr);
} while (ret == -EAGAIN && !path->nowait);
out:
btrfs_release_path(path);
- if (btrfs_is_data_reloc_root(root))
+ if (btrfs_is_data_reloc_root(inode->root))
WARN_ON(ret > 0);
return ret;
}
@@ -2571,13 +2587,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_fs_info *fs_info = cache->fs_info;
-
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
- num_bytes);
+ btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@@ -2724,15 +2737,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
- struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
- u64 len;
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
int ret = 0;
while (start <= end) {
+ u64 len;
+
readonly = false;
if (!cache ||
start >= cache->start + cache->length) {
@@ -2778,37 +2791,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
+ btrfs_space_info_update_bytes_pinned(space_info, -len);
space_info->max_extent_size = 0;
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
- btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
- len);
+ btrfs_space_info_update_bytes_zone_unusable(space_info, len);
readonly = true;
}
spin_unlock(&cache->lock);
- if (!readonly && return_free_space &&
- global_rsv->space_info == space_info) {
- spin_lock(&global_rsv->lock);
- if (!global_rsv->full) {
- u64 to_add = min(len, global_rsv->size -
- global_rsv->reserved);
-
- global_rsv->reserved += to_add;
- btrfs_space_info_update_bytes_may_use(fs_info,
- space_info, to_add);
- if (global_rsv->reserved >= global_rsv->size)
- global_rsv->full = 1;
- len -= to_add;
- }
- spin_unlock(&global_rsv->lock);
- }
- /* Add to any tickets we may have */
- if (!readonly && return_free_space && len)
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!readonly && return_free_space)
+ btrfs_return_free_space(space_info, len);
spin_unlock(&space_info->lock);
}
@@ -2873,7 +2868,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
block_group->length,
&trimmed);
+ /*
+ * Not strictly necessary to lock, as the block_group should be
+ * read-only from btrfs_delete_unused_bgs().
+ */
+ ASSERT(block_group->ro);
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -3259,7 +3262,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
- btrfs_mark_buffer_dirty(trans, leaf);
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
@@ -4827,7 +4829,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return alloc_reserved_extent(trans, ins->objectid, ins->offset);
@@ -4902,7 +4903,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
}
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -5467,7 +5467,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_inline_ref *iref;
int ret;
bool exists = false;
@@ -5484,7 +5484,6 @@ again:
* If we get 0 then we found our reference, return 1, else
* return the error if it's not -ENOENT;
*/
- btrfs_free_path(path);
return (ret < 0 ) ? ret : 1;
}
@@ -5519,7 +5518,6 @@ again:
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
return exists ? 1 : 0;
}
@@ -6287,7 +6285,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct walk_control *wc;
int level;
int parent_level;
@@ -6300,10 +6298,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- if (!wc) {
- btrfs_free_path(path);
+ if (!wc)
return -ENOMEM;
- }
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
@@ -6340,7 +6336,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
}
kfree(wc);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 2ad51130c037..0ed682d9ed7b 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -4,7 +4,6 @@
#define BTRFS_EXTENT_TREE_H
#include <linux/types.h>
-#include "misc.h"
#include "block-group.h"
#include "locking.h"
@@ -116,8 +115,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
-int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict,
+int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -163,5 +161,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
+void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 *actual_bytes);
+int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
#endif
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b923d0cec61c..197f5e51c474 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -198,9 +198,8 @@ static void __process_folios_contig(struct address_space *mapping,
u64 end, unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
struct folio_batch fbatch;
int i;
@@ -221,7 +220,7 @@ static void __process_folios_contig(struct address_space *mapping,
}
}
-static noinline void __unlock_for_delalloc(const struct inode *inode,
+static noinline void unlock_delalloc_folio(const struct inode *inode,
const struct folio *locked_folio,
u64 start, u64 end)
{
@@ -242,9 +241,8 @@ static noinline int lock_delalloc_folios(struct inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
- pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
- pgoff_t index = start_index;
u64 processed_end = start;
struct folio_batch fbatch;
@@ -288,8 +286,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
out:
folio_batch_release(&fbatch);
if (processed_end > start)
- __unlock_for_delalloc(inode, locked_folio, start,
- processed_end);
+ unlock_delalloc_folio(inode, locked_folio, start, processed_end);
return -EAGAIN;
}
@@ -390,7 +387,7 @@ again:
unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- __unlock_for_delalloc(inode, locked_folio, delalloc_start,
+ unlock_delalloc_folio(inode, locked_folio, delalloc_start,
delalloc_end);
cond_resched();
goto again;
@@ -428,14 +425,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_pos(folio) + folio_size(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio);
else
btrfs_folio_end_lock(fs_info, folio, start, len);
@@ -491,11 +488,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio));
- btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}
/*
@@ -526,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
u64 end;
u32 len;
- /* For now only order 0 folios are supported for data. */
- ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
@@ -555,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
@@ -564,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (folio_index(folio) == end_index && i_size <= end) {
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
@@ -632,7 +628,7 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
+ allocated = alloc_pages_bulk(gfp, nr_pages, page_array);
if (unlikely(allocated == last)) {
/* No progress, fail and do cleanup. */
for (int i = 0; i < allocated; i++) {
@@ -710,6 +706,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
bio_ctrl->end_io_func, NULL);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint;
bbio->inode = inode;
bbio->file_offset = file_offset;
bio_ctrl->bbio = bbio;
@@ -756,7 +753,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
{
struct btrfs_inode *inode = folio_to_inode(folio);
- ASSERT(pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
@@ -839,7 +836,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
if (folio->mapping)
lockdep_assert_held(&folio->mapping->i_private_lock);
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
if (!folio_test_private(folio))
folio_attach_private(folio, eb);
else
@@ -862,11 +859,6 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
return ret;
}
-int set_page_extent_mapped(struct page *page)
-{
- return set_folio_extent_mapped(page_folio(page));
-}
-
int set_folio_extent_mapped(struct folio *folio)
{
struct btrfs_fs_info *fs_info;
@@ -878,7 +870,7 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
+ if (btrfs_is_subpage(fs_info, folio))
return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
@@ -895,18 +887,17 @@ void clear_folio_extent_mapped(struct folio *folio)
return;
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_detach_subpage(fs_info, folio);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
-static struct extent_map *__get_extent_map(struct inode *inode,
- struct folio *folio, u64 start,
- u64 len, struct extent_map **em_cached)
+static struct extent_map *get_extent_map(struct btrfs_inode *inode,
+ struct folio *folio, u64 start,
+ u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- struct extent_state *cached_state = NULL;
ASSERT(em_cached);
@@ -922,14 +913,12 @@ static struct extent_map *__get_extent_map(struct inode *inode,
*em_cached = NULL;
}
- btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state);
- em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);
+ em = btrfs_get_extent(inode, folio, start, len);
if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state);
return em;
}
@@ -946,16 +935,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = folio_pos(folio);
- const u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
- u64 block_start;
struct extent_map *em;
int ret = 0;
- size_t pg_offset = 0;
- size_t iosize;
- size_t blocksize = fs_info->sectorsize;
+ const size_t blocksize = fs_info->sectorsize;
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
@@ -963,30 +948,33 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return ret;
}
- if (folio->index == last_byte >> folio_shift(folio)) {
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
- if (zero_offset) {
- iosize = folio_size(folio) - zero_offset;
- folio_zero_range(folio, zero_offset, iosize);
- }
+ if (zero_offset)
+ folio_zero_range(folio, zero_offset,
+ folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio);
- while (cur <= end) {
+ for (u64 cur = start; cur <= end; cur += blocksize) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
+ unsigned long pg_offset = offset_in_folio(folio, cur);
bool force_bio_submit = false;
u64 disk_bytenr;
+ u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = folio_size(folio) - pg_offset;
- folio_zero_range(folio, pg_offset, iosize);
- end_folio_read(folio, true, cur, iosize);
+ folio_zero_range(folio, pg_offset, end - cur + 1);
+ end_folio_read(folio, true, cur, end - cur + 1);
break;
}
- em = __get_extent_map(inode, folio, cur, end - cur + 1,
- em_cached);
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ end_folio_read(folio, true, cur, blocksize);
+ continue;
+ }
+ em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
@@ -997,15 +985,15 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
compress_type = extent_map_compression(em);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
disk_bytenr = extent_map_block_start(em) + extent_offset;
- block_start = extent_map_block_start(em);
+
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
+ else
+ block_start = extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1054,18 +1042,13 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- folio_zero_range(folio, pg_offset, iosize);
-
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ folio_zero_range(folio, pg_offset, blocksize);
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
@@ -1076,22 +1059,204 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
pg_offset);
- cur = cur + iosize;
- pg_offset += iosize;
}
-
return 0;
}
+/*
+ * Check if we can skip waiting the @ordered extent covering the block at @fileoff.
+ *
+ * @fileoff: Both input and output.
+ * Input as the file offset where the check should start at.
+ * Output as where the next check should start at,
+ * if the function returns true.
+ *
+ * Return true if we can skip to @fileoff. The caller needs to check the new
+ * @fileoff value to make sure it covers the full range, before skipping the
+ * full OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 *fileoff)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio;
+ const u32 blocksize = fs_info->sectorsize;
+ u64 cur = *fileoff;
+ bool ret;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
+
+ /*
+ * We should have locked the folio(s) for range [start, end], thus
+ * there must be a folio and it must be locked.
+ */
+ ASSERT(!IS_ERR(folio));
+ ASSERT(folio_test_locked(folio));
+
+ /*
+ * There are several cases for the folio and OE combination:
+ *
+ * 1) Folio has no private flag
+ * The OE has all its IO done but not yet finished, and folio got
+ * invalidated.
+ *
+ * Have we have to wait for the OE to finish, as it may contain the
+ * to-be-inserted data checksum.
+ * Without the data checksum inserted into the csum tree, read will
+ * just fail with missing csum.
+ */
+ if (!folio_test_private(folio)) {
+ ret = false;
+ goto out;
+ }
+
+ /*
+ * 2) The first block is DIRTY.
+ *
+ * This means the OE is created by some other folios whose file pos is
+ * before this one. And since we are holding the folio lock, the writeback
+ * of this folio cannot start.
+ *
+ * We must skip the whole OE, because it will never start until we
+ * finished our folio read and unlocked the folio.
+ */
+ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ ret = true;
+ /*
+ * At least inside the folio, all the remaining blocks should
+ * also be dirty.
+ */
+ ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+ *fileoff = ordered->file_offset + ordered->num_bytes;
+ goto out;
+ }
+
+ /*
+ * 3) The first block is uptodate.
+ *
+ * At least the first block can be skipped, but we are still not fully
+ * sure. E.g. if the OE has some other folios in the range that cannot
+ * be skipped.
+ * So we return true and update @next_ret to the OE/folio boundary.
+ */
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_pos(folio) + folio_size(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ /*
+ * The whole range to the OE end or folio boundary should also
+ * be uptodate.
+ */
+ ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+ ret = true;
+ *fileoff = cur + range_len;
+ goto out;
+ }
+
+ /*
+ * 4) The first block is not uptodate.
+ *
+ * This means the folio is invalidated after the writeback was finished,
+ * but by some other operations (e.g. block aligned buffered write) the
+ * folio is inserted into filemap.
+ * Very much the same as case 1).
+ */
+ ret = false;
+out:
+ folio_put(folio);
+ return ret;
+}
+
+static bool can_skip_ordered_extent(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 start, u64 end)
+{
+ const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+ u64 cur = max(start, ordered->file_offset);
+
+ while (cur < range_end) {
+ bool can_skip;
+
+ can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
+ if (!can_skip)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Locking helper to make sure we get a stable view of extent maps for the
+ * involved range.
+ *
+ * This is for folio read paths (read and readahead), thus the involved range
+ * should have all the folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ u64 cur_pos;
+
+ /* Caller must provide a valid @cached_state. */
+ ASSERT(cached_state);
+
+ /* The range must at least be page aligned, as all read paths are folio based. */
+ ASSERT(IS_ALIGNED(start, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+ lock_extent(&inode->io_tree, start, end, cached_state);
+ cur_pos = start;
+ while (cur_pos < end) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_range(inode, cur_pos,
+ end - cur_pos + 1);
+ /*
+ * No ordered extents in the range, and we hold the extent lock,
+ * no one can modify the extent maps in the range, we're safe to return.
+ */
+ if (!ordered)
+ break;
+
+ /* Check if we can skip waiting for the whole OE. */
+ if (can_skip_ordered_extent(inode, ordered, start, end)) {
+ cur_pos = min(ordered->file_offset + ordered->num_bytes,
+ end + 1);
+ btrfs_put_ordered_extent(ordered);
+ continue;
+ }
+
+ /* Now wait for the OE to finish. */
+ unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
+ btrfs_put_ordered_extent(ordered);
+ /* We have unlocked the whole range, restart from the beginning. */
+ goto again;
+ }
+}
+
int btrfs_read_folio(struct file *file, struct folio *folio)
{
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ const u64 start = folio_pos(folio);
+ const u64 end = start + folio_size(folio) - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
struct extent_map *em_cached = NULL;
int ret;
+ lock_extents_for_read(inode, start, end, &cached_state);
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
+
free_extent_map(em_cached);
/*
@@ -1110,7 +1275,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
unsigned int start_bit;
unsigned int nbits;
- ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1123,12 +1288,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
- const unsigned int bitmap_size = fs_info->sectors_per_page;
+ const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
- ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1142,14 +1307,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
}
/*
- * helper for extent_writepage(), doing all of the delayed allocation setup.
+ * Do all of the delayed allocation setup.
*
- * This returns 1 if btrfs_run_delalloc_range function did all the work required
- * to write the page (copy into inline extent). In this case the IO has
- * been started and the page is already unlocked.
+ * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
+ * The @folio should no longer be touched (treat it as already unlocked).
*
- * This returns 0 if all went well (page still locked)
- * This returns < 0 if there were errors (page still locked)
+ * Return 0 if there is still dirty block that needs to be submitted through
+ * extent_writepage_io().
+ * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
+ * submitted, and @folio is still kept locked.
+ *
+ * Return <0 if there is any error hit.
+ * Any allocated ordered extent range covering this folio will be marked
+ * finished (IOERR), and @folio is still kept locked.
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct folio *folio,
@@ -1157,9 +1327,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
struct writeback_control *wbc = bio_ctrl->wbc;
- const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
@@ -1167,6 +1338,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* last delalloc end.
*/
u64 last_delalloc_end = 0;
+ /*
+ * The range end (exclusive) of the last successfully finished delalloc
+ * range.
+ * Any range covered by ordered extent must either be manually marked
+ * finished (error handling), or has IO submitted (and finish the
+ * ordered extent normally).
+ *
+ * This records the end of ordered extent cleanup if we hit an error.
+ */
+ u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
@@ -1174,14 +1355,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
- ASSERT(fs_info->sectors_per_page > 1);
+ if (btrfs_is_subpage(fs_info, folio)) {
+ ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
@@ -1235,11 +1416,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) {
+ /*
+ * Some delalloc range may be created by previous folios.
+ * Thus we still need to clean up this range during error
+ * handling.
+ */
+ last_finished_delalloc_end = found_start;
/* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc);
+ if (ret >= 0)
+ last_finished_delalloc_end = found_start + found_len;
+ if (unlikely(ret < 0))
+ btrfs_err_rl(fs_info,
+"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
+ btrfs_root_id(inode->root),
+ btrfs_ino(inode),
+ folio_pos(folio),
+ blocks_per_folio,
+ &bio_ctrl->submit_bitmap,
+ found_start, found_len, ret);
} else {
/*
* We've hit an error during previous delalloc range,
@@ -1247,7 +1445,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
*/
unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
- __unlock_for_delalloc(&inode->vfs_inode, folio,
+ unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
}
@@ -1274,8 +1472,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = found_start + found_len;
}
- if (ret < 0)
+ /*
+ * It's possible we had some ordered extents created before we hit
+ * an error, cleanup non-async successfully created delalloc ranges.
+ */
+ if (unlikely(ret < 0)) {
+ unsigned int bitmap_size = min(
+ (last_finished_delalloc_end - page_start) >>
+ fs_info->sectorsize_bits,
+ blocks_per_folio);
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+ btrfs_mark_ordered_io_finished(inode, folio,
+ page_start + (bit << fs_info->sectorsize_bits),
+ fs_info->sectorsize, false);
return ret;
+ }
out:
if (last_delalloc_end)
delalloc_end = last_delalloc_end;
@@ -1283,7 +1495,7 @@ out:
delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
+ * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
@@ -1292,7 +1504,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1335,7 +1547,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
if (IS_ERR(em))
- return PTR_ERR_OR_ZERO(em);
+ return PTR_ERR(em);
extent_offset = filepos - em->start;
em_end = extent_map_end(em);
@@ -1391,7 +1603,9 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
+ bool error = false;
const u64 folio_start = folio_pos(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
@@ -1400,21 +1614,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
start + len <= folio_start + folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
- if (ret) {
+ if (ret == -EAGAIN) {
/* Fixup worker will requeue */
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
folio_unlock(folio);
return 1;
}
+ if (ret < 0)
+ return ret;
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- fs_info->sectors_per_page);
+ blocks_per_folio);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
@@ -1433,11 +1649,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
- if (ret < 0)
- goto out;
+ if (unlikely(ret < 0)) {
+ /*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+ * to finish normally, other than marked as error.
+ */
+ submit_one_bio(bio_ctrl);
+ /*
+ * Failed to grab the extent map which should be very rare.
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ fs_info->sectorsize, false);
+ error = true;
+ continue;
+ }
submitted_io = true;
}
-out:
+
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
@@ -1445,8 +1676,11 @@ out:
*
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
+ *
+ * If we hit any error, the corresponding sector will still be dirty
+ * thus no need to clear PAGECACHE_TAG_DIRTY.
*/
- if (!submitted_io) {
+ if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
@@ -1464,15 +1698,15 @@ out:
*/
static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
- struct inode *inode = folio->mapping->host;
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- const u64 page_start = folio_pos(folio);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
size_t pg_offset;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read(&inode->vfs_inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
- trace_extent_writepage(folio, inode, bio_ctrl->wbc);
+ trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
WARN_ON(!folio_test_locked(folio));
@@ -1492,30 +1726,56 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* The proper bitmap can only be initialized until writepage_delalloc().
*/
bio_ctrl->submit_bitmap = (unsigned long)-1;
+
+ /*
+ * If the page is dirty but without private set, it's marked dirty
+ * without informing the fs.
+ * Nowadays that is a bug, since the introduction of
+ * pin_user_pages*().
+ *
+ * So here we check if the page has private set to rule out such
+ * case.
+ * But we also have a long history of relying on the COW fixup,
+ * so here we only enable this check for experimental builds until
+ * we're sure it's safe.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
+ unlikely(!folio_test_private(folio))) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), folio_pos(folio));
+ ret = -EUCLEAN;
+ goto done;
+ }
+
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
- ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl);
+ ret = writepage_delalloc(inode, folio, bio_ctrl);
if (ret == 1)
return 0;
if (ret)
goto done;
- ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio),
- PAGE_SIZE, bio_ctrl, i_size);
+ ret = extent_writepage_io(inode, folio, folio_pos(folio),
+ folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
+ if (ret < 0)
+ btrfs_err_rl(fs_info,
+"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ folio_pos(folio), blocks_per_folio,
+ &bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
done:
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- page_start, PAGE_SIZE, !ret);
+ if (ret < 0)
mapping_set_error(folio->mapping, ret);
- }
-
/*
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
@@ -1525,12 +1785,6 @@ done:
return ret;
}
-void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
-{
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
- TASK_UNINTERRUPTIBLE);
-}
-
/*
* Lock extent buffer status and pages for writeback.
*
@@ -1670,21 +1924,13 @@ static struct extent_buffer *find_extent_buffer_nolock(
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
- bool uptodate = !bbio->bio.bi_status;
struct folio_iter fi;
- u32 bio_offset = 0;
- if (!uptodate)
+ if (bbio->bio.bi_status != BLK_STS_OK)
set_btree_ioerr(eb);
bio_for_each_folio_all(fi, &bbio->bio) {
- u64 start = eb->start + bio_offset;
- struct folio *folio = fi.folio;
- u32 len = fi.length;
-
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- bio_offset += len;
+ btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
@@ -1738,38 +1984,21 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
- if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
- bool ret;
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
folio_lock(folio);
- btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
- eb->len)) {
- folio_clear_dirty_for_io(folio);
- wbc->nr_to_write--;
- }
- ret = bio_add_folio(&bbio->bio, folio, eb->len,
- eb->start - folio_pos(folio));
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->len);
- folio_unlock(folio);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
- bool ret;
-
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ btrfs_meta_folio_clear_dirty(folio, eb);
+ btrfs_meta_folio_set_writeback(folio, eb);
+ if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
+ wbc_account_cgroup_owner(wbc, folio, range_len);
+ folio_unlock(folio);
}
btrfs_submit_bbio(bbio, 0);
}
@@ -1795,9 +2024,10 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
u64 folio_start = folio_pos(folio);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->sectors_per_page) {
+ while (bit_start < blocks_per_folio) {
struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
@@ -1813,7 +2043,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
+ if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&folio->mapping->i_private_lock);
@@ -1879,7 +2109,7 @@ static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ct
if (!folio_test_private(folio))
return 0;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return submit_eb_subpage(folio, wbc);
spin_lock(&mapping->i_private_lock);
@@ -2138,10 +2368,8 @@ retry:
done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
- * the page lock: the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to
- * tmpfs file mapping
+ * the folio lock: the folio may be truncated or
+ * invalidated (changing folio->mapping to NULL).
*/
if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
@@ -2179,7 +2407,7 @@ retry:
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
- btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
+ btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2260,8 +2488,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
while (cur <= end) {
- u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
- u32 cur_len = cur_end + 1 - cur;
+ u64 cur_end;
+ u32 cur_len;
struct folio *folio;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2271,13 +2499,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* code is just in case, but shouldn't actually be run.
*/
if (IS_ERR(folio)) {
+ cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
- cur = cur_end + 1;
+ cur = cur_end;
continue;
}
+ cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end);
+ cur_len = cur_end + 1 - cur;
+
ASSERT(folio_test_locked(folio));
if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
@@ -2292,11 +2525,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
if (ret == 1)
goto next_page;
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- cur, cur_len, !ret);
+ if (ret)
mapping_set_error(mapping, ret);
- }
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
@@ -2332,12 +2562,20 @@ void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
struct folio *folio;
+ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ const u64 start = readahead_pos(rac);
+ const u64 end = start + readahead_length(rac) - 1;
+ struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
+ lock_extents_for_read(inode, start, end, &cached_state);
+
while ((folio = readahead_folio(rac)) != NULL)
btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
+
if (em_cached)
free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
@@ -2384,7 +2622,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
bool ret;
if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
@@ -2422,7 +2660,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -2495,11 +2733,6 @@ next:
return try_release_extent_state(io_tree, folio);
}
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
- kmem_cache_free(extent_buffer_cache, eb);
-}
-
static int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -2538,7 +2771,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
return;
}
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -2564,7 +2797,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2575,13 +2808,13 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
spin_unlock(&folio->mapping->i_private_lock);
}
-/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
+/* Release all folios attached to the extent buffer */
+static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
@@ -2603,20 +2836,19 @@ static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
- btrfs_release_extent_buffer_pages(eb);
+ btrfs_release_extent_buffer_folios(eb);
btrfs_leak_debug_del_eb(eb);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
-static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len)
+static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
- eb->len = len;
+ eb->len = fs_info->nodesize;
eb->fs_info = fs_info;
init_rwsem(&eb->lock);
@@ -2625,7 +2857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
- ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+ ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -2633,10 +2865,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
- int num_folios = num_extent_folios(src);
int ret;
- new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ new = __alloc_extent_buffer(src->fs_info, src->start);
if (new == NULL)
return NULL;
@@ -2653,7 +2884,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return NULL;
}
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(src); i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
@@ -2669,26 +2900,24 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
return new;
}
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- int num_folios = 0;
int ret;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return NULL;
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto err;
+ goto out;
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto err;
+ goto out_detach;
}
set_extent_buffer_uptodate(eb);
@@ -2696,23 +2925,19 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
-err:
- for (int i = 0; i < num_folios; i++) {
+
+out_detach:
+ for (int i = 0; i < num_extent_folios(eb); i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
folio_put(eb->folios[i]);
}
}
- __free_extent_buffer(eb);
+out:
+ kmem_cache_free(extent_buffer_cache, eb);
return NULL;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
-{
- return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
-}
-
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -2751,11 +2976,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_folios= num_extent_folios(eb);
-
check_buffer_tree_ref(eb);
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_mark_accessed(eb->folios[i]);
}
@@ -2788,10 +3011,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2827,23 +3050,25 @@ again:
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
-static struct extent_buffer *grab_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page)
+static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct extent_buffer *exists;
- lockdep_assert_held(&page->mapping->i_private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(fs_info))
return NULL;
/* Page not yet attached to an extent buffer */
@@ -2851,7 +3076,7 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/*
- * We could have already allocated an eb for this page and attached one
+ * We could have already allocated an eb for this folio and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
* just overwrite folio private.
@@ -2860,16 +3085,19 @@ static struct extent_buffer *grab_extent_buffer(
if (atomic_inc_not_zero(&exists->refs))
return exists;
- WARN_ON(PageDirty(page));
+ WARN_ON(folio_test_dirty(folio));
folio_detach_private(folio);
return NULL;
}
-static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+/*
+ * Validate alignment constraints of eb at logical address @start.
+ */
+static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
if (!IS_ALIGNED(start, fs_info->sectorsize)) {
btrfs_err(fs_info, "bad tree block start %llu", start);
- return -EINVAL;
+ return true;
}
if (fs_info->nodesize < PAGE_SIZE &&
@@ -2877,14 +3105,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
btrfs_err(fs_info,
"tree block crosses page boundary, start %llu nodesize %u",
start, fs_info->nodesize);
- return -EINVAL;
+ return true;
}
if (fs_info->nodesize >= PAGE_SIZE &&
!PAGE_ALIGNED(start)) {
btrfs_err(fs_info,
"tree block is not page aligned, start %llu nodesize %u",
start, fs_info->nodesize);
- return -EINVAL;
+ return true;
}
if (!IS_ALIGNED(start, fs_info->nodesize) &&
!test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
@@ -2892,10 +3120,9 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
start, fs_info->nodesize);
}
- return 0;
+ return false;
}
-
/*
* Return 0 if eb->folios[i] is attached to btree inode successfully.
* Return >0 if there is already another extent buffer for the range,
@@ -2944,15 +3171,14 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
- if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
} else if (existing_folio) {
struct extent_buffer *existing_eb;
- existing_eb = grab_extent_buffer(fs_info,
- folio_page(existing_folio, 0));
+ existing_eb = grab_extent_buffer(fs_info, existing_folio);
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
@@ -2987,8 +3213,6 @@ finish:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
- unsigned long len = fs_info->nodesize;
- int num_folios;
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
@@ -3016,7 +3240,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
@@ -3036,8 +3260,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (btrfs_meta_is_subpage(fs_info)) {
+ prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3052,9 +3276,8 @@ reallocate:
goto out;
}
- num_folios = num_extent_folios(eb);
/* Attach all pages to the filemap. */
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
@@ -3094,7 +3317,7 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+ WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/*
* Check if the current page is physically contiguous with previous eb
@@ -3105,7 +3328,7 @@ reallocate:
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
- if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
+ if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/*
@@ -3148,8 +3371,8 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (int i = 0; i < num_folios; i++)
- unlock_page(folio_page(eb->folios[i], 0));
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ folio_unlock(eb->folios[i]);
return eb;
out:
@@ -3173,13 +3396,13 @@ out:
for (int i = 0; i < attached; i++) {
ASSERT(eb->folios[i]);
detach_extent_buffer_folio(eb, eb->folios[i]);
- unlock_page(folio_page(eb->folios[i], 0));
+ folio_unlock(eb->folios[i]);
folio_put(eb->folios[i]);
eb->folios[i] = NULL;
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing page->mapping.
+ * so it can be cleaned up without utilizing folio->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3195,7 +3418,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
struct extent_buffer *eb =
container_of(head, struct extent_buffer, rcu_head);
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
}
static int release_extent_buffer(struct extent_buffer *eb)
@@ -3219,11 +3442,11 @@ static int release_extent_buffer(struct extent_buffer *eb)
}
btrfs_leak_debug_del_eb(eb);
- /* Should be safe to release our pages at this point */
- btrfs_release_extent_buffer_pages(eb);
+ /* Should be safe to release folios at this point. */
+ btrfs_release_extent_buffer_folios(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
- __free_extent_buffer(eb);
+ kmem_cache_free(extent_buffer_cache, eb);
return 1;
}
#endif
@@ -3279,11 +3502,10 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty(struct folio *folio)
+static void btree_clear_folio_dirty_tag(struct folio *folio)
{
- ASSERT(folio_test_dirty(folio));
+ ASSERT(!folio_test_dirty(folio));
ASSERT(folio_test_locked(folio));
- folio_clear_dirty_for_io(folio);
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
__xa_clear_mark(&folio->mapping->i_pages,
@@ -3291,26 +3513,10 @@ static void btree_clear_folio_dirty(struct folio *folio)
xa_unlock_irq(&folio->mapping->i_pages);
}
-static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct folio *folio = eb->folios[0];
- bool last;
-
- /* btree_clear_folio_dirty() needs page locked. */
- folio_lock(folio);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
- if (last)
- btree_clear_folio_dirty(folio);
- folio_unlock(folio);
- WARN_ON(atomic_read(&eb->refs) == 0);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios;
btrfs_assert_tree_write_locked(eb);
@@ -3337,17 +3543,16 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return clear_subpage_extent_buffer_dirty(eb);
-
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
+ bool last;
if (!folio_test_dirty(folio))
continue;
folio_lock(folio);
- btree_clear_folio_dirty(folio);
+ last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
+ if (last)
+ btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
@@ -3355,92 +3560,66 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/*
* For subpage case, we can have other extent buffers in the
- * same page, and in clear_subpage_extent_buffer_dirty() we
+ * same page, and in clear_extent_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
- * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * Thankfully, clear_extent_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
- lock_page(folio_page(eb->folios[0], 0));
- for (int i = 0; i < num_folios; i++)
- btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
- eb->start, eb->len);
+ folio_lock(eb->folios[0]);
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_dirty(eb->folios[i], eb);
if (subpage)
- unlock_page(folio_page(eb->folios[0], 0));
+ folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
if (!folio)
continue;
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_clear_uptodate(folio);
- else
- btrfs_subpage_clear_uptodate(fs_info, folio,
- eb->start, eb->len);
+ btrfs_meta_folio_clear_uptodate(folio, eb);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
-
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_mark_uptodate(folio);
- else
- btrfs_subpage_set_uptodate(fs_info, folio,
- eb->start, eb->len);
- }
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
}
static void clear_extent_buffer_reading(struct extent_buffer *eb)
@@ -3453,10 +3632,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb)
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct folio_iter fi;
- u32 bio_offset = 0;
/*
* If the extent buffer is marked UPTODATE before the read operation
@@ -3478,30 +3654,16 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_folio_all(fi, &bbio->bio) {
- struct folio *folio = fi.folio;
- u64 start = eb->start + bio_offset;
- u32 len = fi.length;
-
- if (uptodate)
- btrfs_folio_set_uptodate(fs_info, folio, start, len);
- else
- btrfs_folio_clear_uptodate(fs_info, folio, start, len);
-
- bio_offset += len;
- }
-
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
bio_put(&bbio->bio);
}
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- const struct btrfs_tree_parent_check *check)
+int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
- bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3516,7 +3678,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
/* Someone else is already reading the buffer, just wait for it. */
if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
- goto done;
+ return 0;
/*
* Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
@@ -3541,29 +3703,31 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
- if (eb->fs_info->nodesize < PAGE_SIZE) {
- ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
- eb->start - folio_pos(eb->folios[0]));
- ASSERT(ret);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ eb->start + eb->len) - range_start;
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
}
btrfs_submit_bbio(bbio, mirror_num);
+ return 0;
+}
-done:
- if (wait == WAIT_COMPLETE) {
- wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
- return -EIO;
- }
+int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *check)
+{
+ int ret;
+ ret = read_extent_buffer_pages_nowait(eb, mirror_num, check);
+ if (ret < 0)
+ return ret;
+
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ return -EIO;
return 0;
}
@@ -3735,7 +3899,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
- if (fs_info->nodesize < PAGE_SIZE) {
+ if (btrfs_meta_is_subpage(fs_info)) {
folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
@@ -4221,7 +4385,7 @@ int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return try_release_subpage_extent_buffer(folio);
/*
@@ -4294,7 +4458,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
return;
}
- ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check);
+ ret = read_extent_buffer_pages_nowait(eb, 0, &check);
if (ret < 0)
free_extent_buffer_stale(eb);
else
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8a36117ed453..2e261892c7bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -248,13 +248,10 @@ int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
-int set_page_extent_mapped(struct page *page);
void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
@@ -262,17 +259,23 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
-#define WAIT_NONE 0
-#define WAIT_COMPLETE 1
-#define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num,
const struct btrfs_tree_parent_check *parent_check);
-void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
+ const struct btrfs_tree_parent_check *parent_check);
+
+static inline void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+ wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+ TASK_UNINTERRUPTIBLE);
+}
+
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
-static inline int num_extent_pages(const struct extent_buffer *eb)
+/* Note: this can be used in for loops without caching the value in a variable. */
+static inline int __pure num_extent_pages(const struct extent_buffer *eb)
{
/*
* For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
@@ -290,8 +293,10 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
* As we can have either one large folio covering the whole eb
* (either nodesize <= PAGE_SIZE, or high order folio), or multiple
* single-paged folios.
+ *
+ * Note: this can be used in for loops without caching the value in a variable.
*/
-static inline int num_extent_folios(const struct extent_buffer *eb)
+static inline int __pure num_extent_folios(const struct extent_buffer *eb)
{
if (folio_order(eb->folios[0]))
return 1;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 67ce85ff0ae2..7f46abbd6311 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
long nr_dropped = 0;
struct rb_node *node;
+ lockdep_assert_held_write(&tree->lock);
+
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
- * We also do a try lock because otherwise we could deadlock. This is
- * because the shrinker for this filesystem may be invoked while we are
- * in a path that is holding the mmap lock in write mode. For example in
- * a reflink operation while COWing an extent buffer, when allocating
- * pages for a new extent buffer and under memory pressure, the shrinker
- * may be invoked, and therefore we would deadlock by attempting to read
- * lock the mmap lock while we are holding already a write lock on it.
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- /*
- * We want to be fast so if the lock is busy we don't want to spend time
- * waiting for it - either some task is about to do IO for the inode or
- * we may have another task shrinking extent maps, here in this code, so
- * skip this inode.
- */
- if (!write_trylock(&tree->lock)) {
- up_read(&inode->i_mmap_lock);
- return 0;
- }
-
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@@ -1201,12 +1187,61 @@ next:
break;
node = next;
}
- write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,21 +1249,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
long nr_dropped = 0;
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
- btrfs_add_delayed_iput(inode);
+ iput(&inode->vfs_inode);
- if (ctx->scanned >= ctx->nr_to_scan ||
- btrfs_fs_closing(inode->root->fs_info))
+ if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
break;
cond_resched();
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 886749b39672..344b4db487a0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_file_extent_item *item;
struct btrfs_key file_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
file_key.objectid = objectid;
- file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = pos;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -191,9 +192,6 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -214,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -261,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int cow = mod != 0;
file_key.objectid = objectid;
- file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = offset;
return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
@@ -343,7 +341,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
@@ -375,10 +373,8 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
- if (!bbio->csum) {
- btrfs_free_path(path);
+ if (!bbio->csum)
return BLK_STS_RESOURCE;
- }
} else {
bbio->csum = bbio->csum_inline;
}
@@ -446,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
- btrfs_free_path(path);
return ret;
}
@@ -486,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
path->nowait = nowait;
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -876,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
@@ -894,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
while (1) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = end_byte - 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -1012,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -1076,8 +1070,8 @@ again:
found_next = 0;
bytenr = sums->logical + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -1259,7 +1253,6 @@ found:
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 0e13661a71f3..6181a70ec3ef 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,10 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/blk_types.h>
#include <linux/list.h>
#include <uapi/linux/btrfs_tree.h>
+#include "ctree.h"
#include "accessors.h"
struct extent_map;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 14e27473c5bc..71b8a825c447 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -36,52 +36,7 @@
#include "ioctl.h"
#include "file.h"
#include "super.h"
-
-/*
- * Helper to fault in page and copy. This should go away and be replaced with
- * calls into generic code.
- */
-static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
- struct folio *folio, struct iov_iter *i)
-{
- size_t copied = 0;
- size_t total_copied = 0;
- int offset = offset_in_page(pos);
-
- while (write_bytes > 0) {
- size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
- /*
- * Copy data from userspace to the current page
- */
- copied = copy_folio_from_iter_atomic(folio, offset, count, i);
-
- /* Flush processor's dcache for this page */
- flush_dcache_folio(folio);
-
- /*
- * if we get a partial write, we can end up with
- * partially up to date page. These add
- * a lot of complexity, so make sure they don't
- * happen by forcing this copy to be retried.
- *
- * The rest of the btrfs_file_write code will fall
- * back to page at a time copies after we return 0.
- */
- if (unlikely(copied < count)) {
- if (!folio_test_uptodate(folio)) {
- iov_iter_revert(i, copied);
- copied = 0;
- }
- if (!copied)
- break;
- }
-
- write_bytes -= copied;
- total_copied += copied;
- offset += copied;
- }
- return total_copied;
-}
+#include "print-tree.h"
/*
* Unlock folio after btrfs_file_write() is done with it.
@@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
}
/*
- * After btrfs_copy_from_user(), update the following things for delalloc:
+ * After copy_folio_from_iter_atomic(), update the following things for delalloc:
* - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup
@@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (args->start >= inode->disk_i_size && !args->replace_extent)
+ if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
@@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
ret = btrfs_next_leaf(root, path);
if (ret < 0)
break;
@@ -321,7 +280,11 @@ next_slot:
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end < extent_end) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@@ -351,7 +314,6 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
struct btrfs_ref ref = {
@@ -397,7 +359,6 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += args->end - key.offset;
break;
@@ -409,7 +370,11 @@ next_slot:
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end >= extent_end) {
- BUG_ON(del_nr > 0);
+ if (WARN_ON(del_nr > 0)) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@@ -417,7 +382,6 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
- btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += extent_end - args->start;
if (args->end == extent_end)
@@ -437,7 +401,11 @@ delete_extent_item:
del_slot = path->slots[0];
del_nr = 1;
} else {
- BUG_ON(del_slot + del_nr != path->slots[0]);
+ if (WARN_ON(del_slot + del_nr != path->slots[0])) {
+ btrfs_print_leaf(leaf);
+ ret = -EINVAL;
+ break;
+ }
del_nr++;
}
@@ -668,7 +636,6 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -697,7 +664,6 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -731,7 +697,6 @@ again:
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
- btrfs_mark_buffer_dirty(trans, leaf);
ref.action = BTRFS_ADD_DELAYED_REF;
ref.bytenr = bytenr;
@@ -810,7 +775,6 @@ again:
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
@@ -819,7 +783,6 @@ again:
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
- btrfs_mark_buffer_dirty(trans, leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
@@ -841,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
if (folio_test_uptodate(folio))
return 0;
if (!force_uptodate &&
- IS_ALIGNED(clamp_start, PAGE_SIZE) &&
- IS_ALIGNED(clamp_end, PAGE_SIZE))
+ IS_ALIGNED(clamp_start, blocksize) &&
+ IS_ALIGNED(clamp_end, blocksize))
return 0;
ret = btrfs_read_folio(NULL, folio);
@@ -911,7 +875,6 @@ again:
ret = PTR_ERR(folio);
return ret;
}
- folio_wait_writeback(folio);
/* Only support page sized folio yet. */
ASSERT(folio_order(folio) == 0);
ret = set_folio_extent_mapped(folio);
@@ -1051,8 +1014,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, nowait, false);
+ ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@@ -1076,7 +1038,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
- loff_t start_pos;
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
@@ -1103,9 +1064,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
inode_inc_iversion(inode);
}
- start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
+ if (pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
@@ -1129,7 +1089,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
u64 lockend;
size_t num_written = 0;
ssize_t ret;
- loff_t old_isize = i_size_read(inode);
+ loff_t old_isize;
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1142,6 +1102,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret < 0)
return ret;
+ /*
+ * We can only trust the isize with inode lock held, or it can race with
+ * other buffered writes and cause incorrect call of
+ * pagecache_isize_extended() to overwrite existing data.
+ */
+ old_isize = i_size_read(inode);
+
ret = generic_write_checks(iocb, i);
if (ret <= 0)
goto out;
@@ -1252,7 +1219,23 @@ again:
break;
}
- copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
+ copied = copy_folio_from_iter_atomic(folio,
+ offset_in_folio(folio, pos), write_bytes, i);
+ flush_dcache_folio(folio);
+
+ /*
+ * If we get a partial write, we can end up with partially
+ * uptodate page. Although if sector size < page size we can
+ * handle it, but if it's not sector aligned it can cause
+ * a lot of complexity, so make sure they don't happen by
+ * forcing retry this copy.
+ */
+ if (unlikely(copied < write_bytes)) {
+ if (!folio_test_uptodate(folio)) {
+ iov_iter_revert(i, copied);
+ copied = 0;
+ }
+ }
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
@@ -1799,6 +1782,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
+ size_t fsize = folio_size(folio);
vm_fault_t ret;
int ret2;
int reserved = 0;
@@ -1809,7 +1793,7 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ASSERT(folio_order(folio) == 0);
- reserved_space = PAGE_SIZE;
+ reserved_space = fsize;
sb_start_pagefault(inode->i_sb);
page_start = folio_pos(folio);
@@ -1863,7 +1847,7 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
@@ -1875,11 +1859,11 @@ again:
if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
+ if (reserved_space < fsize) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
+ fsize - reserved_space, true);
}
}
@@ -1906,12 +1890,12 @@ again:
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
- zero_start = PAGE_SIZE;
+ zero_start = fsize;
- if (zero_start != PAGE_SIZE)
+ if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
@@ -1920,7 +1904,7 @@ again:
unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -1929,7 +1913,7 @@ out_unlock:
folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
@@ -2029,7 +2013,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
@@ -2046,7 +2029,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
btrfs_release_path(path);
@@ -2122,15 +2104,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
*/
const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ break;
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2142,7 +2129,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* we do, unlock the range and retry.
*/
if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ page_lockend - 1))
break;
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -2194,7 +2181,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index de89e644be29..d7df81388cbe 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -9,6 +9,8 @@ struct file;
struct extent_state;
struct kiocb;
struct iov_iter;
+struct inode;
+struct folio;
struct page;
struct btrfs_ioctl_encoded_io_args;
struct btrfs_drop_extents_args;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfa52ef40b06..05e173311c1a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -12,7 +12,7 @@
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
-#include "ctree.h"
+#include "extent-tree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode;
unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
- mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_constraint(inode->i_mapping,
+ mapping_set_gfp_mask(inode->vfs_inode.i_mapping,
+ mapping_gfp_constraint(inode->vfs_inode.i_mapping,
~(__GFP_FS | __GFP_HIGHMEM)));
- return inode;
+ return &inode->vfs_inode;
}
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -198,12 +198,11 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
@@ -216,7 +215,6 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -246,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -259,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
if (PTR_ERR(inode) != -ENOENT)
ret = PTR_ERR(inode);
- goto out;
+ return ret;
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(BTRFS_I(inode));
- goto out;
+ return ret;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -287,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = 0;
- goto out;
+ return ret;
}
- ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, trans->fs_info->tree_root, path);
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
@@ -449,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
- struct page *page;
+ struct folio *folio;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -457,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
for (i = 0; i < io_ctl->num_pages; i++) {
int ret;
- page = find_or_create_page(inode->i_mapping, i, mask);
- if (!page) {
+ folio = __filemap_get_folio(inode->i_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
+ if (IS_ERR(folio)) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
io_ctl_drop_pages(io_ctl);
return ret;
}
- io_ctl->pages[i] = page;
- if (uptodate && !PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ io_ctl->pages[i] = &folio->page;
+ if (uptodate && !folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"free space cache page truncated");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
io_ctl_drop_pages(io_ctl);
@@ -755,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -1158,8 +1155,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
@@ -1189,7 +1186,6 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 7ba50e133921..39c6b96a4c25 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -89,7 +89,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -287,7 +286,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
if (extent_count != expected_extent_count) {
@@ -324,7 +322,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
i += extent_size;
@@ -430,7 +427,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
@@ -495,7 +491,6 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
@@ -1067,7 +1062,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path, *path2;
+ BTRFS_PATH_AUTO_FREE(path);
+ BTRFS_PATH_AUTO_FREE(path2);
struct btrfs_key key;
u64 start, end;
int ret;
@@ -1075,17 +1071,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
- if (!path2) {
- btrfs_free_path(path);
+ if (!path2)
return -ENOMEM;
- }
+
+ path->reada = READA_FORWARD;
ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
- goto out;
+ return ret;
mutex_lock(&block_group->free_space_lock);
@@ -1151,9 +1146,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = 0;
out_locked:
mutex_unlock(&block_group->free_space_lock);
-out:
- btrfs_free_path(path2);
- btrfs_free_path(path);
+
return ret;
}
@@ -1222,7 +1215,7 @@ out_clear:
static int clear_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int nr;
int ret;
@@ -1238,7 +1231,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
nr = btrfs_header_nritems(path->nodes[0]);
if (!nr)
@@ -1247,15 +1240,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1350,6 +1340,12 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_end_transaction(trans);
return ret;
}
+ if (btrfs_should_end_transaction(trans)) {
+ btrfs_end_transaction(trans);
+ trans = btrfs_start_transaction(free_space_root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ }
node = rb_next(node);
}
@@ -1637,9 +1633,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u32 extent_count, flags;
- int ret;
block_group = caching_ctl->block_group;
@@ -1656,10 +1651,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
path->reada = READA_FORWARD;
info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1669,11 +1663,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* there.
*/
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
- ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ return load_free_space_bitmaps(caching_ctl, path, extent_count);
else
- ret = load_free_space_extents(caching_ctl, path, extent_count);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return load_free_space_extents(caching_ctl, path, extent_count);
}
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 31c1648bc0b4..b2bb86f8d7cf 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -1,9 +1,138 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
-#include "ctree.h"
#include "fs.h"
#include "accessors.h"
+#include "volumes.h"
+
+static const struct btrfs_csums {
+ u16 size;
+ const char name[10];
+ const char driver[12];
+} btrfs_csums[] = {
+ [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
+};
+
+/* This exists for btrfs-progs usages. */
+u16 btrfs_csum_type_size(u16 type)
+{
+ return btrfs_csums[type].size;
+}
+
+int btrfs_super_csum_size(const struct btrfs_super_block *s)
+{
+ u16 t = btrfs_super_csum_type(s);
+
+ /* csum type is validated at mount time. */
+ return btrfs_csum_type_size(t);
+}
+
+const char *btrfs_super_csum_name(u16 csum_type)
+{
+ /* csum type is validated at mount time. */
+ return btrfs_csums[csum_type].name;
+}
+
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name.
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
+{
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
+ btrfs_csums[csum_type].name;
+}
+
+size_t __attribute_const__ btrfs_get_num_csums(void)
+{
+ return ARRAY_SIZE(btrfs_csums);
+}
+
+/*
+ * Start exclusive operation @type, return true on success.
+ */
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ bool ret = false;
+
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+/*
+ * Conditionally allow to enter the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock()
+ * and btrfs_exclop_finish().
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - when trying to add a device and balance has been paused
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
+ */
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type ||
+ (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
+ type == BTRFS_EXCLOP_DEV_ADD))
+ return true;
+
+ spin_unlock(&fs_info->super_lock);
+ return false;
+}
+
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
+{
+ spin_unlock(&fs_info->super_lock);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->super_lock);
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op)
+{
+ switch (op) {
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
+ fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ spin_lock(&fs_info->super_lock);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
+ spin_unlock(&fs_info->super_lock);
+ break;
+ default:
+ btrfs_warn(fs_info,
+ "invalid exclop balance operation %d requested", op);
+ }
+}
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79a1a3d6f04d..bcca43046064 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -14,10 +14,10 @@
#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
-#include <linux/rwlock_types.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/list.h>
+#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
@@ -47,6 +47,18 @@ struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;
+/*
+ * Minimum data and metadata block size.
+ *
+ * Normally it's 4K, but for testing subpage block size on 4K page systems, we
+ * allow DEBUG builds to accept 2K page size.
+ */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_MIN_BLOCKSIZE (SZ_2K)
+#else
+#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#endif
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -105,6 +117,9 @@ enum {
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ /* No more delayed iput can be queued. */
+ BTRFS_FS_STATE_NO_DELAYED_IPUT,
+
BTRFS_FS_STATE_COUNT
};
@@ -485,8 +500,8 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
- unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_type;
+ int compress_level;
u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
@@ -627,6 +642,9 @@ struct btrfs_fs_info {
struct kobject *qgroups_kobj;
struct kobject *discard_kobj;
+ /* Track the number of blocks (sectors) read by the filesystem. */
+ struct percpu_counter stats_read_blocks;
+
/* Used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
@@ -706,7 +724,6 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
struct btrfs_discard_ctl discard_ctl;
@@ -887,6 +904,11 @@ struct btrfs_fs_info {
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
struct inode *: (_inode)))->root->fs_info)
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
+}
+
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@@ -953,6 +975,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
+#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits)
+
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
@@ -971,6 +995,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
+static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
+ const struct folio *folio)
+{
+ return folio_size(folio) >> fs_info->sectorsize_bits;
+}
+
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
@@ -982,6 +1012,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
+u16 btrfs_csum_type_size(u16 type);
+int btrfs_super_csum_size(const struct btrfs_super_block *s);
+const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __attribute_const__ btrfs_get_num_csums(void);
+
+static inline bool btrfs_is_empty_uuid(const u8 *uuid)
+{
+ return uuid_is_null((const uuid_t *)uuid);
+}
+
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
@@ -1058,6 +1099,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
+/*
+ * We use folio flag owner_2 to indicate there is an ordered extent with
+ * unfinished IO.
+ */
+#define folio_test_ordered(folio) folio_test_owner_2(folio)
+#define folio_set_ordered(folio) folio_set_owner_2(folio)
+#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 29572dfaf878..3530de0618c8 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -191,8 +191,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -298,8 +298,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-
out:
btrfs_free_path(path);
return ret;
@@ -319,8 +317,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -363,8 +361,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-
out:
btrfs_free_path(path);
@@ -497,8 +493,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
key.objectid = control->ino;
- key.offset = (u64)-1;
key.type = (u8)-1;
+ key.offset = (u64)-1;
search_again:
/*
@@ -590,7 +586,6 @@ search_again:
num_dec = (orig_num_bytes - extent_num_bytes);
if (extent_start != 0)
control->sub_bytes += num_dec;
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 488edca8333a..cc67d1a2d611 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,34 +393,13 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
- struct folio *locked_folio,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = 0, page_end = 0;
struct folio *folio;
- if (locked_folio) {
- page_start = folio_pos(locked_folio);
- page_end = page_start + folio_size(locked_folio) - 1;
- }
-
while (index <= end_index) {
- /*
- * For locked page, we will call btrfs_mark_ordered_io_finished
- * through btrfs_mark_ordered_io_finished() on it
- * in run_delalloc_range() for the error handling, which will
- * clear page Ordered and run the ordered extent accounting.
- *
- * Here we can't just clear the Ordered bit, or
- * btrfs_mark_ordered_io_finished() would skip the accounting
- * for the page range, and the ordered extent will never finish.
- */
- if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
- index++;
- continue;
- }
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++;
if (IS_ERR(folio))
@@ -436,23 +415,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
folio_put(folio);
}
- if (locked_folio) {
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_start + folio_size(locked_folio))
- return;
- /*
- * In case this page belongs to the delalloc range being
- * instantiated then skip it, since the first page of a range is
- * going to be properly cleaned up by the caller of
- * run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - folio_pos(locked_folio) -
- folio_size(locked_folio);
- offset = folio_pos(locked_folio) + folio_size(locked_folio);
- }
- }
-
return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
@@ -527,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t datasize;
key.objectid = btrfs_ino(inode);
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -564,7 +526,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
kunmap_local(kaddr);
folio_put(folio);
}
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -605,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (offset != 0)
return false;
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (fs_info->sectorsize != PAGE_SIZE)
- return false;
-
/* Inline extents are limited to sectorsize. */
if (size > fs_info->sectorsize)
return false;
+ /* We do not allow a non-compressed extent to be as large as block size. */
+ if (data_len >= fs_info->sectorsize)
+ return false;
+
/* We cannot exceed the maximum inline data size. */
if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return false;
@@ -711,7 +663,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -871,7 +823,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(inode, small_write);
}
-static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
unsigned long end_index = end >> PAGE_SHIFT;
struct folio *folio;
@@ -879,13 +831,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- folio = filemap_get_folio(inode->i_mapping, index);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
end + 1 - start);
folio_put(folio);
}
@@ -925,6 +877,7 @@ static void compress_file_range(struct btrfs_work *work)
unsigned int poff;
int i;
int compress_type = fs_info->compress_type;
+ int compress_level = fs_info->compress_level;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -933,7 +886,7 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(inode, start, end);
/*
* All the folios should have been locked thus no failure.
@@ -1007,13 +960,15 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress)
+ if (inode->defrag_compress) {
compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
+ compress_level = inode->defrag_compress_level;
+ } else if (inode->prop_compress) {
compress_type = inode->prop_compress;
+ }
/* Compression level is applied here. */
- ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
@@ -1129,19 +1084,13 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, locked_folio,
- start, end - start + 1);
- if (locked_folio) {
- const u64 page_start = folio_pos(locked_folio);
-
- folio_start_writeback(locked_folio);
- folio_end_writeback(locked_folio);
- btrfs_mark_ordered_io_finished(inode, locked_folio,
- page_start, PAGE_SIZE,
- !ret);
- mapping_set_error(locked_folio->mapping, ret);
- folio_unlock(locked_folio);
- }
+ if (locked_folio)
+ btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
+ start, async_extent->ram_size);
+ btrfs_err_rl(inode->root->fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start, async_extent->ram_size, ret);
}
}
@@ -1316,10 +1265,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
- * while-loop, the ordered extents created in previous iterations are kept
- * intact. So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
@@ -1373,6 +1319,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
+ * We're not doing compressed IO, don't unlock the first page (which
+ * the caller expects to stay locked), don't clear any dirty bits and
+ * don't set any writeback bits.
+ *
+ * Do set the Ordered (Private2) bit so we know this page was properly
+ * setup for writepage.
+ */
+ page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
+ page_ops |= PAGE_SET_ORDERED;
+
+ /*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
* extents follows a NOCOW path because relocation preallocates the
@@ -1415,8 +1372,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1431,6 +1393,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
+ /*
+ * Locked range will be released either during error clean up or
+ * after the whole range is finished.
+ */
lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
@@ -1476,21 +1442,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /*
- * We're not doing compressed IO, don't unlock the first page
- * (which the caller expects to stay locked), don't clear any
- * dirty bits and don't set any writeback bits
- *
- * Do set the Ordered flag so we know this page was
- * properly setup for writepage.
- */
- page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
- page_ops |= PAGE_SET_ORDERED;
-
- extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
- locked_folio, &cached,
- EXTENT_LOCKED | EXTENT_DELALLOC,
- page_ops);
if (num_bytes < cur_alloc_size)
num_bytes = 0;
else
@@ -1507,6 +1458,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
+ extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
done:
if (done_offset)
*done_offset = end;
@@ -1527,35 +1480,30 @@ out_unlock:
* We process each region below.
*/
- clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
-
/*
* For the range (1). We have already instantiated the ordered extents
- * for this region. They are cleaned up by
- * btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
- * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
- * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
- * function.
+ * for this region, thus we need to cleanup those ordered extents.
+ * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
+ * are also handled by the ordered extents cleanup.
*
- * However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_folio) to ensure all the pages are unlocked.
+ * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
+ * finish the writeback of the involved folios, which will be never submitted.
*/
- if (keep_locked && orig_start < start) {
+ if (orig_start < start) {
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+ btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_folio, NULL, 0, page_ops);
+ locked_folio, NULL, clear_bits, page_ops);
}
- /*
- * At this point we're unlocked, we want to make sure we're only
- * clearing these flags under the extent lock, so lock the rest of the
- * range and clear everything up.
- */
- lock_extent(&inode->io_tree, start, end, NULL);
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (2). If we reserved an extent for our delalloc range
@@ -1589,6 +1537,10 @@ out_unlock:
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
+ btrfs_err_rl(fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
return ret;
}
@@ -1809,7 +1761,7 @@ static int fallback_to_cow(struct btrfs_inode *inode,
bytes = range_bytes;
spin_lock(&sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ btrfs_space_info_update_bytes_may_use(sinfo, bytes);
spin_unlock(&sinfo->lock);
if (count > 0)
@@ -1837,7 +1789,6 @@ struct can_nocow_file_extent_args {
/* End file offset (inclusive) of the range we want to NOCOW. */
u64 end;
bool writeback_path;
- bool strict;
/*
* Free the path passed to can_nocow_file_extent() once it's not needed
* anymore.
@@ -1892,8 +1843,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* for its subvolume was created, then this implies the extent is shared,
* hence we must COW.
*/
- if (!args->strict &&
- btrfs_file_extent_generation(leaf, fi) <=
+ if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out;
@@ -1922,9 +1872,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
*/
btrfs_release_path(path);
- ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->file_extent.offset,
- args->file_extent.disk_bytenr, args->strict, path);
+ ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1971,6 +1920,112 @@ static int can_nocow_file_extent(struct btrfs_path *path,
}
/*
+ * Cleanup the dirty folios which will never be submitted due to error.
+ *
+ * When running a delalloc range, we may need to split the ranges (due to
+ * fragmentation or NOCOW). If we hit an error in the later part, we will error
+ * out and previously successfully executed range will never be submitted, thus
+ * we have to cleanup those folios by clearing their dirty flag, starting and
+ * finishing the writeback.
+ */
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(end + 1, fs_info->sectorsize));
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * The btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1 << BTRFS_ORDERED_PREALLOC)
+ : (1 << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+ /*
+ * On error, we need to cleanup the ordered extents we created.
+ *
+ * We do not clear the folio Dirty flags because they are set and
+ * cleaered by the caller.
+ */
+ if (ret < 0)
+ btrfs_cleanup_ordered_extents(inode, file_pos, end);
+ return ret;
+}
+
+/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
@@ -1985,6 +2040,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
+ /*
+ * If not 0, represents the inclusive end of the last fallback_to_cow()
+ * range. Only for error handling.
+ */
+ u64 cow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
@@ -2009,15 +2069,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2143,74 +2200,21 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1);
- cow_start = (u64)-1;
if (ret) {
+ cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
+ cow_start = (u64)-1;
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -2218,11 +2222,12 @@ must_cow:
cow_start = cur_offset;
if (cow_start != (u64)-1) {
- cur_offset = end;
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
- cow_start = (u64)-1;
- if (ret)
+ if (ret) {
+ cow_end = end;
goto error;
+ }
+ cow_start = (u64)-1;
}
btrfs_free_path(path);
@@ -2230,13 +2235,59 @@ must_cow:
error:
/*
- * If an error happened while a COW region is outstanding, cur_offset
- * needs to be reset to cow_start to ensure the COW region is unlocked
- * as well.
+ * There are several error cases:
+ *
+ * 1) Failed without falling back to COW
+ * start cur_offset end
+ * |/////////////| |
+ *
+ * In this case, cow_start should be (u64)-1.
+ *
+ * For range [start, cur_offset) the folios are already unlocked (except
+ * @locked_folio), EXTENT_DELALLOC already removed.
+ * Need to clear the dirty flags and finish the ordered extents.
+ *
+ * 2) Failed with error before calling fallback_to_cow()
+ *
+ * start cow_start end
+ * |/////////////| |
+ *
+ * In this case, only @cow_start is set, @cur_offset is between
+ * [cow_start, end)
+ *
+ * It's mostly the same as case 1), just replace @cur_offset with
+ * @cow_start.
+ *
+ * 3) Failed with error from fallback_to_cow()
+ *
+ * start cow_start cow_end end
+ * |/////////////|-----------| |
+ *
+ * In this case, both @cow_start and @cow_end is set.
+ *
+ * For range [start, cow_start) it's the same as case 1).
+ * But for range [cow_start, cow_end), all the cleanup is handled by
+ * cow_file_range(), we should not touch anything in that range.
+ *
+ * So for all above cases, if @cow_start is set, cleanup ordered extents
+ * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
+ if (cur_offset > start) {
+ btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+ }
+
+ /*
+ * If an error happened while a COW region is outstanding, cur_offset
+ * needs to be reset to @cow_end + 1 to skip the COW range, as
+ * cow_file_range() will do the proper cleanup at error.
+ */
+ if (cow_end)
+ cur_offset = cow_end + 1;
+
/*
* We need to lock the extent here because we're clearing DELALLOC and
* we're not locked at this point.
@@ -2255,6 +2306,10 @@ error:
btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
}
btrfs_free_path(path);
+ btrfs_err_rl(fs_info,
+ "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
+ __func__, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start, end + 1 - start, ret);
return ret;
}
@@ -2288,7 +2343,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
- goto out;
+ return ret;
}
if (btrfs_inode_can_compress(inode) &&
@@ -2302,11 +2357,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
else
ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
-
-out:
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_folio, start,
- end - start + 1);
return ret;
}
@@ -2833,6 +2883,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
return 0;
/*
+ * For experimental build, we error out instead of EAGAIN.
+ *
+ * We should not hit such out-of-band dirty folios anymore.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)),
+ folio_pos(folio));
+ return -EUCLEAN;
+ }
+
+ /*
* folio_checked is set below when we create a fixup worker for this
* folio, don't try to create another one if we're already
* folio_test_checked.
@@ -2851,7 +2916,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the folio lock, and we can't trust
- * page->mapping outside of the folio lock.
+ * folio->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
@@ -2907,8 +2972,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
+ ins.offset = file_pos;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
@@ -2921,7 +2986,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -2944,8 +3008,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
- ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = disk_num_bytes;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
@@ -3363,6 +3427,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode)
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
+ WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
atomic_inc(&fs_info->nr_delayed_iputs);
/*
* Need to be irq safe here because we can be called from either an irq
@@ -3483,7 +3548,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
- struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
@@ -3502,6 +3566,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -3625,10 +3691,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (!inode || inode->i_nlink) {
+ if (!inode || inode->vfs_inode.i_nlink) {
if (inode) {
- ret = btrfs_drop_verity_items(BTRFS_I(inode));
- iput(inode);
+ ret = btrfs_drop_verity_items(inode);
+ iput(&inode->vfs_inode);
inode = NULL;
if (ret)
goto out;
@@ -3651,7 +3717,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
nr_unlink++;
/* this will do delete_inode and everything for us */
- iput(inode);
+ iput(&inode->vfs_inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3801,12 +3867,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
*
* On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
+static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode *vfs_inode = &inode->vfs_inode;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
@@ -3815,7 +3882,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ ret = btrfs_init_file_extent_tree(inode);
if (ret)
goto out;
@@ -3825,7 +3892,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
ASSERT(path);
- btrfs_get_inode_key(BTRFS_I(inode), &location);
+ btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3845,41 +3912,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
- i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
-
- inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+
+ inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
btrfs_timespec_nsec(leaf, &inode_item->mtime));
- inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
+ inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
- inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
- BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
+ inode->generation = btrfs_inode_generation(leaf, inode_item);
+ inode->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode_set_iversion_queried(inode,
- btrfs_inode_sequence(leaf, inode_item));
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
+ inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
+ vfs_inode->i_generation = inode->generation;
+ vfs_inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
+ btrfs_update_inode_mapping_flags(inode);
cache_index:
/*
@@ -3891,9 +3958,8 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
@@ -3922,7 +3988,7 @@ cache_index:
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
- BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_unlink_trans = inode->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
@@ -3930,15 +3996,15 @@ cache_index:
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
- BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_reflink_trans = inode->last_trans;
path->slots[0]++;
- if (inode->i_nlink != 1 ||
+ if (vfs_inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
- if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ if (location.objectid != btrfs_ino(inode))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3946,13 +4012,12 @@ cache_index:
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ inode->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
- extref);
+ inode->dir_index = btrfs_inode_extref_index(leaf, extref);
}
cache_acl:
/*
@@ -3960,50 +4025,49 @@ cache_acl:
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ btrfs_ino(inode), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)),
- btrfs_root_id(root), ret);
+ btrfs_ino(inode), btrfs_root_id(root), ret);
}
if (!maybe_acls)
- cache_no_acl(inode);
+ cache_no_acl(vfs_inode);
- switch (inode->i_mode & S_IFMT) {
+ switch (vfs_inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_fop = &btrfs_file_operations;
+ vfs_inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
- inode->i_fop = &btrfs_dir_file_operations;
- inode->i_op = &btrfs_dir_inode_operations;
+ vfs_inode->i_fop = &btrfs_dir_file_operations;
+ vfs_inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(vfs_inode);
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
+ vfs_inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ ret = btrfs_add_inode_to_root(inode, true);
if (ret)
goto out;
return 0;
out:
- iget_failed(inode);
+ iget_failed(vfs_inode);
return ret;
}
@@ -4085,7 +4149,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
@@ -5559,7 +5622,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5571,40 +5634,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
- return inode;
+ if (!inode)
+ return NULL;
+ return BTRFS_I(inode);
}
/*
* Get an inode object given its inode number and corresponding root. Path is
* preallocated to prevent recursing back to iget through allocator.
*/
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path)
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
int ret;
inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
/*
* Get an inode object given its inode number and corresponding root.
*/
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_path *path;
int ret;
@@ -5612,7 +5677,7 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
path = btrfs_alloc_path();
@@ -5624,43 +5689,46 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
-static struct inode *new_simple_dir(struct inode *dir,
- struct btrfs_key *key,
- struct btrfs_root *root)
+static struct btrfs_inode *new_simple_dir(struct inode *dir,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
{
struct timespec64 ts;
- struct inode *inode = new_inode(dir->i_sb);
+ struct inode *vfs_inode;
+ struct btrfs_inode *inode;
- if (!inode)
+ vfs_inode = new_inode(dir->i_sb);
+ if (!vfs_inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = btrfs_grab_root(root);
- BTRFS_I(inode)->ref_root_id = key->objectid;
- set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
- set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ inode = BTRFS_I(vfs_inode);
+ inode->root = btrfs_grab_root(root);
+ inode->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
+ set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
- btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
+ btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
- inode->i_op = &simple_dir_inode_operations;
- inode->i_opflags &= ~IOP_XATTR;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ vfs_inode->i_op = &simple_dir_inode_operations;
+ vfs_inode->i_opflags &= ~IOP_XATTR;
+ vfs_inode->i_fop = &simple_dir_operations;
+ vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- ts = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, ts);
- inode_set_atime_to_ts(inode, inode_get_atime(dir));
- BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
- BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+ ts = inode_set_ctime_current(vfs_inode);
+ inode_set_mtime_to_ts(vfs_inode, ts);
+ inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
+ inode->i_otime_sec = ts.tv_sec;
+ inode->i_otime_nsec = ts.tv_nsec;
- inode->i_uid = dir->i_uid;
- inode->i_gid = dir->i_gid;
+ vfs_inode->i_uid = dir->i_uid;
+ vfs_inode->i_gid = dir->i_gid;
return inode;
}
@@ -5674,15 +5742,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
-static inline u8 btrfs_inode_type(struct inode *inode)
+static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
{
- return fs_umode_to_ftype(inode->i_mode);
+ return fs_umode_to_ftype(inode->vfs_inode.i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location = { 0 };
@@ -5699,18 +5767,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
if (btrfs_inode_type(inode) != di_type) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
- inode->i_mode, btrfs_inode_type(inode),
+ inode->vfs_inode.i_mode, btrfs_inode_type(inode),
di_type);
- iput(inode);
+ iput(&inode->vfs_inode);
return ERR_PTR(-EUCLEAN);
}
- return inode;
+ return &inode->vfs_inode;
}
ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
@@ -5725,19 +5793,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
btrfs_put_root(sub_root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
down_read(&fs_info->cleanup_work_sem);
- if (!sb_rdonly(inode->i_sb))
+ if (!sb_rdonly(inode->vfs_inode.i_sb))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&fs_info->cleanup_work_sem);
if (ret) {
- iput(inode);
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
}
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return &inode->vfs_inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
@@ -6210,7 +6281,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6296,6 +6367,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
+ btrfs_update_inode_mapping_flags(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6380,7 +6452,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6390,7 +6461,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
path = NULL;
if (args->subvol) {
- struct inode *parent;
+ struct btrfs_inode *parent;
/*
* Subvolumes inherit properties from their parent subvolume,
@@ -6400,11 +6471,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
- ret = btrfs_inode_inherit_props(trans, inode, parent);
- iput(parent);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ parent);
+ iput(&parent->vfs_inode);
}
} else {
- ret = btrfs_inode_inherit_props(trans, inode, dir);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ BTRFS_I(dir));
}
if (ret) {
btrfs_err(fs_info,
@@ -6502,7 +6575,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
- btrfs_inode_type(&inode->vfs_inode), index);
+ btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
@@ -6702,18 +6775,18 @@ fail:
return err;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6722,6 +6795,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 blocksize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6738,7 +6812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_SIZE, max_size);
+ max_size = min_t(unsigned long, blocksize, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
max_size);
@@ -6750,14 +6824,15 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+ if (max_size < blocksize)
+ folio_zero_range(folio, max_size, blocksize - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
+ const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
@@ -6772,14 +6847,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
return uncompress_inline(path, folio, fi);
- copy_size = min_t(u64, PAGE_SIZE,
+ copy_size = min_t(u64, blocksize,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < blocksize)
+ folio_zero_range(folio, copy_size, blocksize - copy_size);
return 0;
}
@@ -7011,8 +7086,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
- * @strict: if true, omit optimizations that might force us into unnecessary
- * cow. e.g., don't trust generation number.
*
* Return:
* >0 and update @len if we can do nocow write
@@ -7022,17 +7095,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
- bool nowait, bool strict)
+ bool nowait)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
int found_type;
@@ -7042,8 +7115,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return -ENOMEM;
path->nowait = nowait;
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), offset, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
if (ret < 0)
goto out;
@@ -7058,7 +7131,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
goto out;
@@ -7077,10 +7150,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
- nocow_args.strict = strict;
nocow_args.free_path = true;
- ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
/* can_nocow_file_extent() has freed the path. */
path = NULL;
@@ -7096,7 +7168,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.file_extent.offset))
goto out;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
@@ -7201,7 +7273,7 @@ static void wait_subpage_spinlock(struct folio *folio)
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@@ -7225,7 +7297,7 @@ static void wait_subpage_spinlock(struct folio *folio)
static int btrfs_launder_folio(struct folio *folio)
{
return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
- PAGE_SIZE, NULL);
+ folio_size(folio), NULL);
}
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -8027,31 +8099,45 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
@@ -8287,16 +8373,23 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
} else {
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
- if (!ret)
- ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
if (new_inode) {
@@ -8304,18 +8397,27 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
&new_fname.disk_name);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
- if (!ret && new_inode->i_nlink == 0)
+ if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_fail;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_fail;
+ }
}
}
@@ -8430,8 +8532,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
- struct btrfs_inode *binode;
- struct inode *inode;
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
@@ -8442,30 +8542,30 @@ static int start_delalloc_inodes(struct btrfs_root *root,
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- binode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ struct btrfs_inode *inode;
+ struct inode *tmp_inode;
+
+ inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes);
- list_move_tail(&binode->delalloc_inodes,
- &root->delalloc_inodes);
+ list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
if (in_reclaim_context &&
- test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
continue;
- inode = igrab(&binode->vfs_inode);
- if (!inode) {
+ tmp_inode = igrab(&inode->vfs_inode);
+ if (!tmp_inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
- set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
- &binode->runtime_flags);
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
if (full_flush) {
- work = btrfs_alloc_delalloc_work(inode);
+ work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
if (!work) {
- iput(inode);
+ iput(&inode->vfs_inode);
ret = -ENOMEM;
goto out;
}
@@ -8473,8 +8573,8 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -8591,7 +8691,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ /*
+ * Symlinks utilize uncompressed inline extent data, which should not
+ * reach block size.
+ */
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ name_len >= fs_info->sectorsize)
return -ENAMETOOLONG;
inode = new_inode(dir->i_sb);
@@ -8630,8 +8735,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
@@ -8655,7 +8760,6 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
@@ -9078,9 +9182,9 @@ out:
}
struct btrfs_encoded_read_private {
- wait_queue_head_t wait;
+ struct completion *sync_reads;
void *uring_ctx;
- atomic_t pending;
+ refcount_t pending_refs;
blk_status_t status;
};
@@ -9090,23 +9194,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
if (bbio->bio.bi_status) {
/*
- * The memory barrier implied by the atomic_dec_return() here
- * pairs with the memory barrier implied by the
- * atomic_dec_return() or io_wait_event() in
- * btrfs_encoded_read_regular_fill_pages() to ensure that this
- * write is observed before the load of status in
+ * The memory barrier implied by the refcount_dec_and_test() here
+ * pairs with the memory barrier implied by the refcount_dec_and_test()
+ * in btrfs_encoded_read_regular_fill_pages() to ensure that
+ * this write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (atomic_dec_and_test(&priv->pending)) {
+ if (refcount_dec_and_test(&priv->pending_refs)) {
int err = blk_status_to_errno(READ_ONCE(priv->status));
if (priv->uring_ctx) {
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
- wake_up(&priv->wait);
+ complete(priv->sync_reads);
}
}
bio_put(&bbio->bio);
@@ -9117,17 +9220,27 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private *priv;
+ struct btrfs_encoded_read_private *priv, sync_priv;
+ struct completion sync_reads;
unsigned long i = 0;
struct btrfs_bio *bbio;
int ret;
- priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
- if (!priv)
- return -ENOMEM;
+ /*
+ * Fast path for synchronous reads which completes in this call, io_uring
+ * needs longer time span.
+ */
+ if (uring_ctx) {
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+ } else {
+ priv = &sync_priv;
+ init_completion(&sync_reads);
+ priv->sync_reads = &sync_reads;
+ }
- init_waitqueue_head(&priv->wait);
- atomic_set(&priv->pending, 1);
+ refcount_set(&priv->pending_refs, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
@@ -9140,7 +9253,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv->pending);
+ refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
@@ -9155,11 +9268,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv->pending);
+ refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
if (uring_ctx) {
- if (atomic_dec_return(&priv->pending) == 0) {
+ if (refcount_dec_and_test(&priv->pending_refs)) {
ret = blk_status_to_errno(READ_ONCE(priv->status));
btrfs_uring_read_extent_endio(uring_ctx, ret);
kfree(priv);
@@ -9168,12 +9281,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
return -EIOCBQUEUED;
} else {
- if (atomic_dec_return(&priv->pending) != 0)
- io_wait_event(priv->wait, !atomic_read(&priv->pending));
+ if (!refcount_dec_and_test(&priv->pending_refs))
+ wait_for_completion_io(&sync_reads);
/* See btrfs_encoded_read_endio() for ordering. */
- ret = blk_status_to_errno(READ_ONCE(priv->status));
- kfree(priv);
- return ret;
+ return blk_status_to_errno(READ_ONCE(priv->status));
}
}
@@ -9799,15 +9910,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct extent_map *em = NULL;
struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
+ struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
+ struct btrfs_path *path = NULL;
int ret = 0;
u64 isize;
- u64 start;
+ u64 prev_extent_end = 0;
+
+ /*
+ * Acquire the inode's mmap lock to prevent races with memory mapped
+ * writes, as they could happen after we flush delalloc below and before
+ * we lock the extent range further below. The inode was already locked
+ * up in the call chain.
+ */
+ btrfs_assert_inode_locked(BTRFS_I(inode));
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
/*
* If the swap file was just created, make sure delalloc is done. If the
@@ -9816,22 +9937,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
*/
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- return ret;
+ goto out_unlock_mmap;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
+ }
+
+ path = btrfs_alloc_path();
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ if (!path || !backref_ctx) {
+ ret = -ENOMEM;
+ goto out_unlock_mmap;
}
/*
@@ -9846,7 +9977,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock_mmap;
}
/*
@@ -9860,7 +9992,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out_unlock_mmap;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
@@ -9881,7 +10014,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
btrfs_root_id(root));
- return -EPERM;
+ ret = -EPERM;
+ goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
@@ -9889,24 +10023,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
lock_extent(io_tree, 0, isize - 1, &cached_state);
- start = 0;
- while (start < isize) {
- u64 logical_block_start, physical_block_start;
+ while (prev_extent_end < isize) {
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
struct btrfs_block_group *bg;
- u64 len = isize - start;
+ u64 logical_block_start;
+ u64 physical_block_start;
+ u64 extent_gen;
+ u64 disk_bytenr;
+ u64 len;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = prev_extent_end;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
goto out;
- }
- if (em->disk_bytenr == EXTENT_MAP_HOLE) {
+ /*
+ * If key not found it means we have an implicit hole (NO_HOLES
+ * is enabled).
+ */
+ if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->disk_bytenr == EXTENT_MAP_INLINE) {
+
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -9918,23 +10067,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (extent_map_is_compressed(em)) {
+
+ if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
- logical_block_start = extent_map_block_start(em) + (start - em->start);
- len = min(len, em->len - (start - em->start));
- free_extent_map(em);
- em = NULL;
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (disk_bytenr == 0) {
+ btrfs_warn(fs_info, "swapfile must not have holes");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+ prev_extent_end = btrfs_file_extent_end(path);
- ret = can_nocow_extent(inode, start, &len, NULL, false, true);
+ if (prev_extent_end > isize)
+ len = isize - key.offset;
+ else
+ len = btrfs_file_extent_num_bytes(leaf, ei);
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /*
+ * Don't need the path anymore, release to avoid deadlocks when
+ * calling btrfs_is_data_extent_shared() because when joining a
+ * transaction it can block waiting for the current one's commit
+ * which in turn may be trying to lock the same leaf to flush
+ * delayed items for example.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
+ extent_gen, backref_ctx);
if (ret < 0) {
goto out;
- } else if (ret) {
- ret = 0;
- } else {
+ } else if (ret > 0) {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
@@ -9969,7 +10140,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
physical_block_start = (map->stripes[0].physical +
(logical_block_start - map->start));
- len = min(len, map->chunk_len - (logical_block_start - map->start));
btrfs_free_chunk_map(map);
map = NULL;
@@ -10010,20 +10180,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
goto out;
}
- bsi.start = start;
+ bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
- start += len;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
- if (!IS_ERR_OR_NULL(em))
- free_extent_map(em);
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
@@ -10036,6 +10209,10 @@ out:
btrfs_exclop_finish(fs_info);
+out_unlock_mmap:
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
if (ret)
return ret;
@@ -10044,7 +10221,6 @@ out:
*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
sis->max = bsi.nr_pages;
sis->pages = bsi.nr_pages - 1;
- sis->highest_bit = bsi.nr_pages - 1;
return bsi.nr_extents;
}
#else
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3af8bb0c8d75..63aeacc54945 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 {
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
- unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
+ unsigned int flags)
{
if (S_ISDIR(inode->i_mode))
return flags;
@@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
+static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
{
unsigned int iflags = 0;
- u32 flags = binode->flags;
- u32 ro_flags = binode->ro_flags;
+ u32 flags = inode->flags;
+ u32 ro_flags = inode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
{
- struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (binode->flags & BTRFS_INODE_SYNC)
+ if (inode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (binode->flags & BTRFS_INODE_IMMUTABLE)
+ if (inode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (binode->flags & BTRFS_INODE_APPEND)
+ if (inode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (binode->flags & BTRFS_INODE_NOATIME)
+ if (inode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (binode->flags & BTRFS_INODE_DIRSYNC)
+ if (inode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
- if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ if (inode->ro_flags & BTRFS_INODE_RO_VERITY)
new_fl |= S_VERITY;
- set_mask_bits(&inode->i_flags,
+ set_mask_bits(&inode->vfs_inode.i_flags,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
S_VERITY, new_fl);
}
@@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
-static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
unsigned int flags)
{
if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
@@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
*/
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
- struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+ const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode));
return 0;
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags;
+ u32 inode_flags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
- fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(inode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (ret)
return ret;
- binode_flags = binode->flags;
+ inode_flags = inode->flags;
if (fsflags & FS_SYNC_FL)
- binode_flags |= BTRFS_INODE_SYNC;
+ inode_flags |= BTRFS_INODE_SYNC;
else
- binode_flags &= ~BTRFS_INODE_SYNC;
+ inode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
- binode_flags |= BTRFS_INODE_IMMUTABLE;
+ inode_flags |= BTRFS_INODE_IMMUTABLE;
else
- binode_flags &= ~BTRFS_INODE_IMMUTABLE;
+ inode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
- binode_flags |= BTRFS_INODE_APPEND;
+ inode_flags |= BTRFS_INODE_APPEND;
else
- binode_flags &= ~BTRFS_INODE_APPEND;
+ inode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
- binode_flags |= BTRFS_INODE_NODUMP;
+ inode_flags |= BTRFS_INODE_NODUMP;
else
- binode_flags &= ~BTRFS_INODE_NODUMP;
+ inode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
- binode_flags |= BTRFS_INODE_NOATIME;
+ inode_flags |= BTRFS_INODE_NOATIME;
else
- binode_flags &= ~BTRFS_INODE_NOATIME;
+ inode_flags &= ~BTRFS_INODE_NOATIME;
/* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
if (!fa->flags_valid) {
@@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
if (fsflags & FS_DIRSYNC_FL)
- binode_flags |= BTRFS_INODE_DIRSYNC;
+ inode_flags |= BTRFS_INODE_DIRSYNC;
else
- binode_flags &= ~BTRFS_INODE_DIRSYNC;
+ inode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
- if (inode->i_size == 0)
- binode_flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
} else {
- binode_flags |= BTRFS_INODE_NODATACOW;
+ inode_flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
* Revert back under same assumptions as above
*/
- if (S_ISREG(inode->i_mode)) {
- if (inode->i_size == 0)
- binode_flags &= ~(BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM);
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
} else {
- binode_flags &= ~BTRFS_INODE_NODATACOW;
+ inode_flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
- binode_flags &= ~BTRFS_INODE_COMPRESS;
- binode_flags |= BTRFS_INODE_NOCOMPRESS;
+ inode_flags &= ~BTRFS_INODE_COMPRESS;
+ inode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode))
+ if (IS_SWAPFILE(&inode->vfs_inode))
return -ETXTBSY;
- binode_flags |= BTRFS_INODE_COMPRESS;
- binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode_flags |= BTRFS_INODE_COMPRESS;
+ inode_flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
- binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
/*
@@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
- NULL, 0, 0);
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -392,98 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
update_flags:
- binode->flags = binode_flags;
+ inode->flags = inode_flags;
+ btrfs_update_inode_mapping_flags(inode);
btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
out_end_trans:
btrfs_end_transaction(trans);
return ret;
}
-/*
- * Start exclusive operation @type, return true on success
- */
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type)
-{
- bool ret = false;
-
- spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
- fs_info->exclusive_operation = type;
- ret = true;
- }
- spin_unlock(&fs_info->super_lock);
-
- return ret;
-}
-
-/*
- * Conditionally allow to enter the exclusive operation in case it's compatible
- * with the running one. This must be paired with btrfs_exclop_start_unlock and
- * btrfs_exclop_finish.
- *
- * Compatibility:
- * - the same type is already running
- * - when trying to add a device and balance has been paused
- * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
- * must check the condition first that would allow none -> @type
- */
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type)
-{
- spin_lock(&fs_info->super_lock);
- if (fs_info->exclusive_operation == type ||
- (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
- type == BTRFS_EXCLOP_DEV_ADD))
- return true;
-
- spin_unlock(&fs_info->super_lock);
- return false;
-}
-
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
-{
- spin_unlock(&fs_info->super_lock);
-}
-
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
-{
- spin_lock(&fs_info->super_lock);
- WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
- spin_unlock(&fs_info->super_lock);
- sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
-}
-
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op)
-{
- switch (op) {
- case BTRFS_EXCLOP_BALANCE_PAUSED:
- spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
- fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
- fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
- spin_unlock(&fs_info->super_lock);
- break;
- case BTRFS_EXCLOP_BALANCE:
- spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
- fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
- spin_unlock(&fs_info->super_lock);
- break;
- default:
- btrfs_warn(fs_info,
- "invalid exclop balance operation %d requested", op);
- }
-}
-
-static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
{
return put_user(inode->i_generation, arg);
}
@@ -551,22 +469,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return ret;
}
-int __pure btrfs_is_empty_uuid(const u8 *uuid)
-{
- int i;
-
- for (i = 0; i < BTRFS_UUID_SIZE; i++) {
- if (uuid[i])
- return 0;
- }
- return 1;
-}
-
/*
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
*/
-static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
{
/*
* 1 to add root block
@@ -708,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
- key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
@@ -969,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *child)
+ struct inode *dir, const struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
@@ -1124,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
- char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1202,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (!strcmp(sizestr, "max"))
new_size = bdev_nr_bytes(device->bdev);
else {
+ char *retptr;
+
if (sizestr[0] == '-') {
mod = -1;
sizestr++;
@@ -1249,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = round_down(new_size, fs_info->sectorsize);
if (new_size > old_size) {
+ struct btrfs_trans_handle *trans;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -1427,15 +1335,15 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
+static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u64 flags = 0;
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&fs_info->subvol_sem);
@@ -1538,8 +1446,8 @@ out:
return ret;
}
-static noinline int key_in_sk(struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk)
+static noinline int key_in_sk(const struct btrfs_key *key,
+ const struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
@@ -1564,7 +1472,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk,
+ const struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
@@ -1621,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
}
sh.objectid = key->objectid;
- sh.offset = key->offset;
sh.type = key->type;
+ sh.offset = key->offset;
sh.len = item_len;
sh.transid = found_transid;
@@ -1695,13 +1603,12 @@ out:
return ret;
}
-static noinline int search_ioctl(struct inode *inode,
+static noinline int search_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = inode_to_fs_info(inode);
- struct btrfs_root *root;
+ struct btrfs_fs_info *info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
int ret;
@@ -1718,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode,
return -ENOMEM;
if (sk->tree_id == 0) {
- /* search the root of the inode that was passed */
- root = btrfs_grab_root(BTRFS_I(inode)->root);
+ /* Search the root that we got passed. */
+ root = btrfs_grab_root(root);
} else {
+ /* Look up the root from the arguments. */
root = btrfs_get_fs_root(info, sk->tree_id, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
@@ -1733,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = -EFAULT;
/*
* Ensure that the whole user buffer is faulted in at sub-page
* granularity, otherwise the loop may live-lock.
*/
- if (fault_in_subpage_writeable(ubuf + sk_offset,
- *buf_size - sk_offset))
+ if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) {
+ ret = -EFAULT;
break;
+ }
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- goto err;
- }
+ if (ret)
+ break;
+
ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
@@ -1755,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode,
break;
}
+ /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */
if (ret > 0)
ret = 0;
-err:
+
sk->nr_items = num_found;
btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs = argp;
@@ -1780,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
buf_size = sizeof(uargs->buf);
- ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+ ret = search_ioctl(root, &sk, &buf_size, uargs->buf);
/*
* In the origin implementation an overflow is handled by returning a
@@ -1794,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
@@ -1816,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
if (buf_size > buf_limit)
buf_size = buf_limit;
- ret = search_ioctl(inode, &args.key, &buf_size,
+ ret = search_ioctl(root, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
@@ -1924,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
- struct inode *temp_inode;
char *ptr;
int slot;
int len;
@@ -1952,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *temp_inode;
+
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out_put;
@@ -2006,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(idmap, temp_inode,
+ ret = inode_permission(idmap, &temp_inode->vfs_inode,
MAY_READ | MAY_EXEC);
- iput(temp_inode);
+ iput(&temp_inode->vfs_inode);
if (ret) {
ret = -EACCES;
goto out_put;
@@ -2635,6 +2543,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
+ /*
+ * Don't allow defrag on pre-content watched files, as it could
+ * populate the page cache with 0's via readahead.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (argp) {
if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
@@ -2653,7 +2570,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -2845,7 +2762,7 @@ out_free:
return ret;
}
-static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -2899,7 +2816,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
return ret;
}
-static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
@@ -3007,7 +2924,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -4331,7 +4247,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
return 0;
}
-static int check_feature_bits(struct btrfs_fs_info *fs_info,
+static int check_feature_bits(const struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
@@ -4467,7 +4383,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4498,7 +4414,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(inode, arg);
+ ret = btrfs_ioctl_send(root, arg);
kfree(arg);
return ret;
}
@@ -4878,25 +4794,29 @@ out_fail:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
size_t copy_end;
- struct btrfs_ioctl_encoded_io_args args = { 0 };
int ret;
u64 disk_bytenr, disk_io_size;
struct file *file;
struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info;
struct extent_io_tree *io_tree;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
+ struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4910,43 +4830,64 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
- struct btrfs_ioctl_encoded_io_args_32 args32;
-
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
- if (copy_from_user(&args32, sqe_addr, copy_end)) {
- ret = -EFAULT;
- goto out_acct;
- }
- args.iov = compat_ptr(args32.iov);
- args.iovcnt = args32.iovcnt;
- args.offset = args32.offset;
- args.flags = args32.flags;
#else
return -ENOTTY;
#endif
} else {
copy_end = copy_end_kernel;
- if (copy_from_user(&args, sqe_addr, copy_end)) {
- ret = -EFAULT;
+ }
+
+ if (!data) {
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data) {
+ ret = -ENOMEM;
goto out_acct;
}
- }
- if (args.flags != 0)
- return -EINVAL;
+ io_uring_cmd_get_async_data(cmd)->op_data = data;
- ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
- &iov, &iter);
- if (ret < 0)
- goto out_acct;
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
- if (iov_iter_count(&iter) == 0) {
- ret = 0;
- goto out_free;
+ if (copy_from_user(&args32, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+
+ data->args.iov = compat_ptr(args32.iov);
+ data->args.iovcnt = args32.iovcnt;
+ data->args.offset = args32.offset;
+ data->args.flags = args32.flags;
+#endif
+ } else {
+ if (copy_from_user(&data->args, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (data->args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ data->iov = data->iovstack;
+ ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt,
+ ARRAY_SIZE(data->iovstack), &data->iov,
+ &data->iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&data->iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
}
- pos = args.offset;
- ret = rw_verify_area(READ, file, &pos, args.len);
+ pos = data->args.offset;
+ ret = rw_verify_area(READ, file, &pos, data->args.len);
if (ret < 0)
goto out_free;
@@ -4959,15 +4900,18 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
start = ALIGN_DOWN(pos, fs_info->sectorsize);
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
file_accessed(file);
- if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
- sizeof(args) - copy_end_kernel)) {
+ if (copy_to_user(sqe_addr + copy_end,
+ (const char *)&data->args + copy_end_kernel,
+ sizeof(data->args) - copy_end_kernel)) {
if (ret == -EIOCBQUEUED) {
unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -4977,40 +4921,22 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
}
if (ret == -EIOCBQUEUED) {
- u64 count;
-
- /*
- * If we've optimized things by storing the iovecs on the stack,
- * undo this.
- */
- if (!iov) {
- iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
- if (!iov) {
- unlock_extent(io_tree, start, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
- ret = -ENOMEM;
- goto out_acct;
- }
-
- memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
- }
-
- count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+ u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size);
/* Match ioctl by not returning past EOF if uncompressed. */
- if (!args.compression)
- count = min_t(u64, count, args.len);
+ if (!data->args.compression)
+ count = min_t(u64, count, data->args.len);
- ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
- cached_state, disk_bytenr,
- disk_io_size, count,
- args.compression, iov, cmd);
+ ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend,
+ cached_state, disk_bytenr, disk_io_size,
+ count, data->args.compression,
+ data->iov, cmd);
goto out_acct;
}
out_free:
- kfree(iov);
+ kfree(data->iov);
out_acct:
if (ret > 0)
@@ -5020,6 +4946,128 @@ out_acct:
return ret;
}
+static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ loff_t pos;
+ struct kiocb kiocb;
+ struct file *file;
+ ssize_t ret;
+ void __user *sqe_addr;
+ struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ file = cmd->file;
+ sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (!data) {
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data) {
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ io_uring_cmd_get_async_data(cmd)->op_data = data;
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, sqe_addr, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ data->args.iov = compat_ptr(args32.iov);
+ data->args.iovcnt = args32.iovcnt;
+ data->args.offset = args32.offset;
+ data->args.flags = args32.flags;
+ data->args.len = args32.len;
+ data->args.unencoded_len = args32.unencoded_len;
+ data->args.unencoded_offset = args32.unencoded_offset;
+ data->args.compression = args32.compression;
+ data->args.encryption = args32.encryption;
+ memcpy(data->args.reserved, args32.reserved,
+ sizeof(data->args.reserved));
+#else
+ ret = -ENOTTY;
+ goto out_acct;
+#endif
+ } else {
+ if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (data->args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved)))
+ goto out_acct;
+ if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (data->args.unencoded_offset > data->args.unencoded_len)
+ goto out_acct;
+ if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset)
+ goto out_acct;
+
+ data->iov = data->iovstack;
+ ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt,
+ ARRAY_SIZE(data->iovstack), &data->iov,
+ &data->iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&data->iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ }
+
+ if (issue_flags & IO_URING_F_NONBLOCK) {
+ ret = -EAGAIN;
+ goto out_acct;
+ }
+
+ pos = data->args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, data->args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
+ if (ret)
+ goto out_iov;
+ kiocb.ki_pos = pos;
+
+ file_start_write(file);
+
+ ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+ file_end_write(file);
+out_iov:
+ kfree(data->iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
switch (cmd->cmd_op) {
@@ -5028,6 +5076,12 @@ int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
case BTRFS_IOC_ENCODED_READ_32:
#endif
return btrfs_uring_encoded_read(cmd, issue_flags);
+
+ case BTRFS_IOC_ENCODED_WRITE:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_WRITE_32:
+#endif
+ return btrfs_uring_encoded_write(cmd, issue_flags);
}
return -EINVAL;
@@ -5189,7 +5243,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(inode, argp);
+ return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5211,9 +5265,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(inode, argp);
+ return btrfs_ioctl_tree_search(root, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(inode, argp);
+ return btrfs_ioctl_tree_search_v2(root, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
@@ -5261,10 +5315,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
+ return _btrfs_ioctl_send(root, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
+ return _btrfs_ioctl_send(root, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5300,6 +5354,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
+ case FS_IOC_READ_VERITY_METADATA:
+ return fsverity_ioctl_read_metadata(file, argp);
case BTRFS_IOC_ENCODED_READ:
return btrfs_ioctl_encoded_read(file, argp, false);
case BTRFS_IOC_ENCODED_WRITE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2b760c8778f8..e08ea446cf48 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -9,6 +9,8 @@ struct file;
struct dentry;
struct mnt_idmap;
struct fileattr;
+struct io_uring_cmd;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_ioctl_balance_args;
@@ -18,8 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(const u8 *uuid);
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9a7a7b723305..81e62b652e21 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
-#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 35036b151bf5..c69e57ff804b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
{
lockdep_assert_held_write(&eb->lock);
}
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_read(&eb->lock);
+}
#else
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..03c945711003 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -842,10 +842,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
/*
* Start IO and wait for a given ordered extent to finish.
*
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent but not in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
+ * IO completion code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -865,8 +867,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
- if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+ if (!nowriteback_len) {
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ } else {
+ if (start < nowriteback_start)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+ nowriteback_start - 1);
+ if (nowriteback_start + nowriteback_len < end)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+ nowriteback_start + nowriteback_len,
+ end);
+ }
+ }
if (!freespace_inode)
btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -1229,6 +1242,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4e152736d06c..1e6b0b182b29 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -17,6 +17,7 @@
struct inode;
struct page;
struct extent_state;
+struct btrfs_block_group;
struct btrfs_inode;
struct btrfs_root;
struct btrfs_fs_info;
@@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len);
+static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ return btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
+}
+
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 8504bf1702c7..d0e620bf5f5a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+#include <linux/types.h>
+
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b8fa34e16abb..adc956432d2f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -26,8 +26,8 @@ struct prop_handler {
const char *xattr_name;
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
- int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(const struct inode *inode);
+ int (*apply)(struct btrfs_inode *inode, const char *value, size_t len);
+ const char *(*extract)(const struct btrfs_inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, NULL, 0);
+ ret = handler->apply(inode, NULL, 0);
ASSERT(ret == 0);
return ret;
@@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, value, value_len);
+ ret = handler->apply(inode, value, value_len);
if (ret) {
btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
@@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- ret = handler->apply(inode, value, len);
+ ret = handler->apply(BTRFS_I(inode), value, len);
if (unlikely(ret))
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
@@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx,
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct btrfs_root *root = inode->root;
+ u64 ino = btrfs_ino(inode);
- return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+ return iterate_object_props(root, path, ino, inode_prop_iterator,
+ &inode->vfs_inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
@@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
return -EINVAL;
}
-static int prop_compression_apply(struct inode *inode, const char *value,
+static int prop_compression_apply(struct btrfs_inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int type;
/* Reset to defaults */
if (len == 0) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
/* Set NOCOMPRESS flag */
if ((len == 2 && strncmp("no", value, 2) == 0) ||
(len == 4 && strncmp("none", value, 4) == 0)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
@@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return -EINVAL;
}
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = type;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
+ inode->prop_compress = type;
return 0;
}
@@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(const struct inode *inode)
+static const char *prop_compression_extract(const struct btrfs_inode *inode)
{
- switch (BTRFS_I(inode)->prop_compress) {
+ switch (inode->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
case BTRFS_COMPRESS_LZO:
case BTRFS_COMPRESS_ZSTD:
- return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ return btrfs_compress_type2str(inode->prop_compress);
default:
break;
}
@@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, const struct inode *parent)
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *parent)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
bool need_reserve = false;
- if (!test_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(parent)->runtime_flags))
+ if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags))
return 0;
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
@@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
- if (h->ignore(BTRFS_I(inode)))
+ if (h->ignore(inode))
continue;
value = h->extract(parent);
@@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(BTRFS_I(inode), value, strlen(value));
+ ret = h->validate(inode, value, strlen(value));
if (ret)
continue;
@@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value,
strlen(value), 0);
if (!ret) {
ret = h->apply(inode, value, strlen(value));
if (ret)
- btrfs_setxattr(trans, inode, h->xattr_name,
+ btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name,
NULL, 0, 0);
else
- set_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
}
if (need_reserve) {
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 63546d0a9444..15d9a025c923 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,9 +6,9 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
+#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct btrfs_inode;
struct btrfs_path;
struct btrfs_trans_handle;
@@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
const char *value, size_t value_len);
bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- const struct inode *dir);
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a6f92836c9b1..d6fa36674270 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -673,9 +673,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
key.offset = dst;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
-
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
-
btrfs_free_path(path);
return ret;
}
@@ -752,8 +749,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
@@ -771,8 +766,6 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
ret = 0;
out:
btrfs_free_path(path);
@@ -859,9 +852,6 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
-
- btrfs_mark_buffer_dirty(trans, l);
-
out:
btrfs_free_path(path);
return ret;
@@ -905,9 +895,6 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
-
- btrfs_mark_buffer_dirty(trans, l);
-
out:
btrfs_free_path(path);
return ret;
@@ -947,9 +934,6 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
-
- btrfs_mark_buffer_dirty(trans, l);
-
out:
btrfs_free_path(path);
return ret;
@@ -972,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = 0;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -1121,6 +1105,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
if (simple) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
} else {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -1129,8 +1114,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
- btrfs_mark_buffer_dirty(trans, leaf);
-
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = 0;
@@ -1254,8 +1237,6 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- if (simple)
- btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */
@@ -1839,9 +1820,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
* Thus its reserved space should all be zero, no matter if qgroup
* is consistent or the mode.
*/
- WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+ if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+
+ }
/*
* The same for rfer/excl numbers, but that's only if our qgroup is
* consistent and if it's in regular qgroup mode.
@@ -1850,8 +1841,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
*/
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
- if (WARN_ON(qgroup->rfer || qgroup->excl ||
- qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
+ if (qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
btrfs_qgroup_level(qgroup->qgroupid),
@@ -1888,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
* Commit current transaction to make sure all the rfer/excl numbers
* get updated.
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(fs_info->quota_root);
if (ret < 0)
return ret;
@@ -1905,8 +1893,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
/*
* It's squota and the subvolume still has numbers needed for future
* accounting, in this case we can not delete it. Just skip it.
+ *
+ * Or the qgroup is already removed by a qgroup rescan. For both cases we're
+ * safe to ignore them.
*/
- if (ret == -EBUSY)
+ if (ret == -EBUSY || ret == -ENOENT)
ret = 0;
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e233cc79af18..a979fd59a4da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args;
struct btrfs_trans_handle;
struct btrfs_delayed_ref_root;
struct btrfs_inode;
+struct btrfs_transaction;
+struct btrfs_block_group;
+struct btrfs_qgroup_swapped_blocks;
/*
* Btrfs qgroup overview
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 9ffc79f250fb..1834011ccc49 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,12 +13,13 @@
#include "volumes.h"
#include "print-tree.h"
-static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *oldkey,
u64 newlen, u64 frontpad)
{
- struct btrfs_stripe_extent *extent;
+ struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
+ struct btrfs_stripe_extent *extent, *newitem;
struct extent_buffer *leaf;
int slot;
size_t item_size;
@@ -27,23 +28,39 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
.type = BTRFS_RAID_STRIPE_KEY,
.offset = newlen,
};
+ int ret;
+ ASSERT(newlen > 0);
ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
leaf = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
+
+ newitem = kzalloc(item_size, GFP_NOFS);
+ if (!newitem)
+ return -ENOMEM;
+
extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
struct btrfs_raid_stride *stride = &extent->strides[i];
u64 phys;
- phys = btrfs_raid_stride_physical(leaf, stride);
- btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+ phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
+ btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);
}
- btrfs_set_item_key_safe(trans, path, &newkey);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);
+
+out:
+ kfree(newitem);
+ return ret;
}
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
@@ -59,9 +76,22 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
int slot;
int ret;
- if (!stripe_root)
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)
return 0;
+ if (!btrfs_is_testing(fs_info)) {
+ struct btrfs_chunk_map *map;
+ bool use_rst;
+
+ map = btrfs_find_chunk_map(fs_info, start, length);
+ if (!map)
+ return -EINVAL;
+ use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
+ btrfs_free_chunk_map(map);
+ if (!use_rst)
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -85,6 +115,37 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
found_end = found_start + key.offset;
ret = 0;
+ /*
+ * The stripe extent starts before the range we want to delete,
+ * but the range spans more than one stripe extent:
+ *
+ * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---|
+ * |--- keep ---|--- drop ---|
+ *
+ * This means we have to get the previous item, truncate its
+ * length and then restart the search.
+ */
+ if (found_start > start) {
+ if (slot == 0) {
+ ret = btrfs_previous_item(stripe_root, path, start,
+ BTRFS_RAID_STRIPE_KEY);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
+ } else {
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+ ASSERT(found_start <= start);
+ }
+
if (key.type != BTRFS_RAID_STRIPE_KEY)
break;
@@ -96,6 +157,54 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
found_start, found_end);
/*
+ * The stripe extent starts before the range we want to delete
+ * and ends after the range we want to delete, i.e. we're
+ * punching a hole in the stripe extent:
+ *
+ * |--- RAID Stripe Extent ---|
+ * | keep |--- drop ---| keep |
+ *
+ * This means we need to a) truncate the existing item and b)
+ * create a second item for the remaining range.
+ */
+ if (found_start < start && found_end > end) {
+ size_t item_size;
+ u64 diff_start = start - found_start;
+ u64 diff_end = found_end - end;
+ struct btrfs_stripe_extent *extent;
+ struct btrfs_key newkey = {
+ .objectid = end,
+ .type = BTRFS_RAID_STRIPE_KEY,
+ .offset = diff_end,
+ };
+
+ /* The "right" item. */
+ ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey);
+ if (ret)
+ break;
+
+ item_size = btrfs_item_size(leaf, path->slots[0]);
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_stripe_extent);
+
+ for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
+ struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 phys;
+
+ phys = btrfs_raid_stride_physical(leaf, stride);
+ phys += diff_start + length;
+ btrfs_set_raid_stride_physical(leaf, stride, phys);
+ }
+
+ /* The "left" item. */
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff_start, 0);
+ break;
+ }
+
+ /*
* The stripe extent starts before the range we want to delete:
*
* |--- RAID Stripe Extent ---|
@@ -105,11 +214,18 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
* length to the new size and then re-insert the item.
*/
if (found_start < start) {
- u64 diff = start - found_start;
+ u64 diff_start = start - found_start;
btrfs_partially_delete_raid_extent(trans, path, &key,
- diff, 0);
- break;
+ diff_start, 0);
+
+ start += (key.offset - diff_start);
+ length -= (key.offset - diff_start);
+ if (length == 0)
+ break;
+
+ btrfs_release_path(path);
+ continue;
}
/*
@@ -122,13 +238,16 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
* length to the new size and then re-insert the item.
*/
if (found_end > end) {
- u64 diff = found_end - end;
+ u64 diff_end = found_end - end;
btrfs_partially_delete_raid_extent(trans, path, &key,
- diff, diff);
+ key.offset - length,
+ length);
+ ASSERT(key.offset - diff_end == length);
break;
}
+ /* Finally we can delete the whole item, no more special cases. */
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
break;
@@ -169,7 +288,6 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
item_size);
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return ret;
@@ -199,12 +317,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
for (int i = 0; i < num_stripes; i++) {
u64 devid = bioc->stripes[i].dev->devid;
u64 physical = bioc->stripes[i].physical;
- u64 length = bioc->stripes[i].length;
struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
- if (length == 0)
- length = bioc->size;
-
btrfs_set_stack_raid_stride_devid(raid_stride, devid);
btrfs_set_stack_raid_stride_physical(raid_stride, physical);
}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 541836421778..69942ad43140 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"
+#include "accessors.h"
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f0824c948cb7..15c296cb4dac 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -165,7 +165,7 @@ out:
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
@@ -175,8 +175,8 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
- struct btrfs_root *root = BTRFS_I(dst)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
@@ -185,12 +185,12 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
- key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -205,7 +205,7 @@ static int clone_copy_inline_extent(struct inode *dst,
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
@@ -214,7 +214,7 @@ static int clone_copy_inline_extent(struct inode *dst,
ASSERT(key.offset > 0);
goto copy_to_page;
}
- } else if (i_size_read(dst) <= datal) {
+ } else if (i_size_read(&inode->vfs_inode) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -236,7 +236,7 @@ copy_inline_extent:
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
- if (i_size_read(dst) > datal) {
+ if (i_size_read(&inode->vfs_inode) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger then the one we want to
@@ -270,7 +270,7 @@ copy_inline_extent:
drop_args.start = drop_start;
drop_args.end = aligned_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
@@ -281,9 +281,9 @@ copy_inline_extent:
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- btrfs_set_inode_full_sync(BTRFS_I(dst));
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+ btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(inode);
+ ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
out:
if (!ret && !trans) {
/*
@@ -318,7 +318,7 @@ copy_to_page:
*/
btrfs_release_path(path);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
@@ -526,7 +526,7 @@ process_slot:
goto out;
}
- ret = clone_copy_inline_extent(inode, path, &new_key,
+ ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
@@ -617,26 +617,26 @@ out:
return ret;
}
-static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
- down_write(&BTRFS_I(inode1)->i_mmap_lock);
- down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+ down_write(&inode1->i_mmap_lock);
+ down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
-static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- up_write(&BTRFS_I(inode1)->i_mmap_lock);
- up_write(&BTRFS_I(inode2)->i_mmap_lock);
+ up_write(&inode1->i_mmap_lock);
+ up_write(&inode2->i_mmap_lock);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+ struct btrfs_inode *dst, u64 dst_loff)
{
const u64 end = dst_loff + len - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ struct btrfs_fs_info *fs_info = src->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
@@ -646,9 +646,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
+ lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+ ALIGN(len, bs), dst_loff, 1);
+ unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -678,8 +679,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+ BTRFS_I(dst), dst_loff);
if (ret)
goto out;
@@ -688,7 +689,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
}
if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+ BTRFS_I(dst), dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
@@ -775,24 +777,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize;
+ struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+ struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+ u64 bs = inode_out->root->fs_info->sectorsize;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+ struct btrfs_root *root_out = inode_out->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
- ASSERT(inode_in->i_sb == inode_out->i_sb);
+ ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
/* Don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+ (inode_out->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
@@ -811,7 +813,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
@@ -832,16 +834,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
- ret = filemap_flush(inode_in->i_mapping);
+ ret = filemap_flush(inode_in->vfs_inode.i_mapping);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
if (ret < 0)
return ret;
@@ -863,8 +863,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
+ struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+ struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
bool same_inode = dst_inode == src_inode;
int ret;
@@ -872,9 +872,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
- lock_two_nondirectories(src_inode, dst_inode);
+ lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}
@@ -884,16 +884,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+ &dst_inode->vfs_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
- unlock_two_nondirectories(src_inode, dst_inode);
+ unlock_two_nondirectories(&src_inode->vfs_inode,
+ &dst_inode->vfs_inode);
}
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index bf267bdfa8f8..e17bcb034595 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
if (cur == node)
ret = true;
- /* The node is the lowest node */
- if (cur->lowest) {
- list_del_init(&cur->lower);
- cur->lowest = 0;
- }
-
/* Cleanup the lower edges */
while (!list_empty(&cur->lower)) {
struct btrfs_backref_edge *edge;
@@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
* cache to avoid unnecessary backref lookup.
*/
if (cur->level > 0) {
- list_add(&cur->list, &cache->detached);
cur->detached = 1;
} else {
rb_erase(&cur->rb_node, &cache->rb_root);
@@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
goto out;
}
- node->lowest = 1;
cur = node;
/* Breadth-first search to build backref cache */
@@ -470,92 +462,6 @@ out:
}
/*
- * helper to add backref node for the newly created snapshot.
- * the backref node is created by cloning backref node that
- * corresponds to root of source tree
- */
-static int clone_backref_node(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- const struct btrfs_root *src,
- struct btrfs_root *dest)
-{
- struct btrfs_root *reloc_root = src->reloc_root;
- struct btrfs_backref_cache *cache = &rc->backref_cache;
- struct btrfs_backref_node *node = NULL;
- struct btrfs_backref_node *new_node;
- struct btrfs_backref_edge *edge;
- struct btrfs_backref_edge *new_edge;
- struct rb_node *rb_node;
-
- rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
- if (rb_node) {
- node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
- if (node->detached)
- node = NULL;
- else
- BUG_ON(node->new_bytenr != reloc_root->node->start);
- }
-
- if (!node) {
- rb_node = rb_simple_search(&cache->rb_root,
- reloc_root->commit_root->start);
- if (rb_node) {
- node = rb_entry(rb_node, struct btrfs_backref_node,
- rb_node);
- BUG_ON(node->detached);
- }
- }
-
- if (!node)
- return 0;
-
- new_node = btrfs_backref_alloc_node(cache, dest->node->start,
- node->level);
- if (!new_node)
- return -ENOMEM;
-
- new_node->lowest = node->lowest;
- new_node->checked = 1;
- new_node->root = btrfs_grab_root(dest);
- ASSERT(new_node->root);
-
- if (!node->lowest) {
- list_for_each_entry(edge, &node->lower, list[UPPER]) {
- new_edge = btrfs_backref_alloc_edge(cache);
- if (!new_edge)
- goto fail;
-
- btrfs_backref_link_edge(new_edge, edge->node[LOWER],
- new_node, LINK_UPPER);
- }
- } else {
- list_add_tail(&new_node->lower, &cache->leaves);
- }
-
- rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
- &new_node->rb_node);
- if (rb_node)
- btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
-
- if (!new_node->lowest) {
- list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
- list_add_tail(&new_edge->list[LOWER],
- &new_edge->node[LOWER]->upper);
- }
- }
- return 0;
-fail:
- while (!list_empty(&new_node->lower)) {
- new_edge = list_entry(new_node->lower.next,
- struct btrfs_backref_edge, list[UPPER]);
- list_del(&new_edge->list[UPPER]);
- btrfs_backref_free_edge(cache, new_edge);
- }
- btrfs_backref_free_node(cache, new_node);
- return -ENOMEM;
-}
-
-/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __add_reloc_root(struct btrfs_root *root)
@@ -950,7 +856,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
u32 i;
int ret = 0;
int first = 1;
- int dirty = 0;
if (rc->stage != UPDATE_DATA_PTRS)
return 0;
@@ -1030,7 +935,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
- dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
ref.action = BTRFS_ADD_DELAYED_REF;
@@ -1061,8 +965,6 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
break;
}
}
- if (dirty)
- btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
btrfs_add_delayed_iput(inode);
return ret;
@@ -1255,13 +1157,11 @@ again:
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
- btrfs_mark_buffer_dirty(trans, parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
- btrfs_mark_buffer_dirty(trans, path->nodes[level]);
ref.action = BTRFS_ADD_DELAYED_REF;
ref.bytenr = old_bytenr;
@@ -2058,100 +1958,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
int index = 0;
int ret;
- next = node;
- while (1) {
- cond_resched();
- next = walk_up_backref(next, edges, &index);
- root = next->root;
-
- /*
- * If there is no root, then our references for this block are
- * incomplete, as we should be able to walk all the way up to a
- * block that is owned by a root.
- *
- * This path is only for SHAREABLE roots, so if we come upon a
- * non-SHAREABLE root then we have backrefs that resolve
- * improperly.
- *
- * Both of these cases indicate file system corruption, or a bug
- * in the backref walking code.
- */
- if (!root) {
- ASSERT(0);
- btrfs_err(trans->fs_info,
- "bytenr %llu doesn't have a backref path ending in a root",
- node->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
- if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
- ASSERT(0);
- btrfs_err(trans->fs_info,
- "bytenr %llu has multiple refs with one ending in a non-shareable root",
- node->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
+ next = walk_up_backref(node, edges, &index);
+ root = next->root;
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
- ret = record_reloc_root_in_trans(trans, root);
- if (ret)
- return ERR_PTR(ret);
- break;
- }
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a block
+ * that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug in the
+ * backref walking code.
+ */
+ if (unlikely(!root)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
- ret = btrfs_record_root_in_trans(trans, root);
+ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
+ ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
- root = root->reloc_root;
-
- /*
- * We could have raced with another thread which failed, so
- * root->reloc_root may not be set, return ENOENT in this case.
- */
- if (!root)
- return ERR_PTR(-ENOENT);
+ goto found;
+ }
- if (next->new_bytenr != root->node->start) {
- /*
- * We just created the reloc root, so we shouldn't have
- * ->new_bytenr set and this shouldn't be in the changed
- * list. If it is then we have multiple roots pointing
- * at the same bytenr which indicates corruption, or
- * we've made a mistake in the backref walking code.
- */
- ASSERT(next->new_bytenr == 0);
- ASSERT(list_empty(&next->list));
- if (next->new_bytenr || !list_empty(&next->list)) {
- btrfs_err(trans->fs_info,
- "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
- node->bytenr, next->bytenr);
- return ERR_PTR(-EUCLEAN);
- }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
+ root = root->reloc_root;
- next->new_bytenr = root->node->start;
- btrfs_put_root(next->root);
- next->root = btrfs_grab_root(root);
- ASSERT(next->root);
- list_add_tail(&next->list,
- &rc->backref_cache.changed);
- mark_block_processed(rc, next);
- break;
- }
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
- WARN_ON(1);
- root = NULL;
- next = walk_down_backref(edges, &index);
- if (!next || next->level <= node->level)
- break;
- }
- if (!root) {
+ if (next->new_bytenr) {
/*
- * This can happen if there's fs corruption or if there's a bug
- * in the backref lookup code.
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set yet. If it is then we have multiple roots
+ * pointing at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
*/
- ASSERT(0);
- return ERR_PTR(-ENOENT);
+ ASSERT(next->new_bytenr == 0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
}
+ next->new_bytenr = root->node->start;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
+ mark_block_processed(rc, next);
+found:
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
@@ -2247,17 +2119,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
return num_bytes;
}
-static int reserve_metadata_space(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_backref_node *node)
+static int refill_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, u64 num_bytes)
{
- struct btrfs_root *root = rc->extent_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- u64 num_bytes;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- u64 tmp;
-
- num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@@ -2270,7 +2136,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
+ u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
+
while (tmp <= rc->reserved_bytes)
tmp <<= 1;
/*
@@ -2288,6 +2155,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
return 0;
}
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_backref_node *node)
+{
+ u64 num_bytes;
+
+ num_bytes = calcu_metadata_size(rc, node) * 2;
+ return refill_metadata_space(trans, rc, num_bytes);
+}
+
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
@@ -2442,7 +2319,7 @@ next:
if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
- list_move_tail(&node->list, &rc->backref_cache.changed);
+ list_del_init(&node->list);
node->pending = 0;
}
@@ -2605,8 +2482,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
/*
* This block was the root block of a root, and this is
* the first time we're processing the block and thus it
- * should not have had the ->new_bytenr modified and
- * should have not been included on the changed list.
+ * should not have had the ->new_bytenr modified.
*
* However in the case of corruption we could have
* multiple refs pointing to the same block improperly,
@@ -2616,8 +2492,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
- ASSERT(list_empty(&node->list));
- if (node->new_bytenr || !list_empty(&node->list)) {
+ if (node->new_bytenr) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@@ -2640,17 +2515,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
btrfs_put_root(node->root);
node->root = btrfs_grab_root(root);
ASSERT(node->root);
- list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
- path->lowest_level = node->level;
- if (root == root->fs_info->chunk_root)
- btrfs_reserve_chunk_metadata(trans, false);
- ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(path);
- if (root == root->fs_info->chunk_root)
- btrfs_trans_release_chunk_metadata(trans);
- if (ret > 0)
- ret = 0;
+ btrfs_err(root->fs_info,
+ "bytenr %llu resolved to a non-shareable root",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
}
if (!ret)
update_processed_blocks(rc, node);
@@ -2658,11 +2528,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
- if (ret || node->level == 0 || node->cowonly)
+ if (ret || node->level == 0)
btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
+static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc, struct tree_block *block,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root;
+ u64 num_bytes;
+ int nr_levels;
+ int ret;
+
+ root = btrfs_get_fs_root(fs_info, block->owner, true);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
+
+ num_bytes = fs_info->nodesize * nr_levels;
+ ret = refill_metadata_space(trans, rc, num_bytes);
+ if (ret) {
+ btrfs_put_root(root);
+ return ret;
+ }
+ path->lowest_level = block->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
+
+ ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
+ path->lowest_level = 0;
+ btrfs_release_path(path);
+
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
+ if (ret > 0)
+ ret = 0;
+ btrfs_put_root(root);
+
+ return ret;
+}
+
/*
* relocate a list of blocks
*/
@@ -2702,6 +2611,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Do tree relocation */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
+ /*
+ * For COWonly blocks, or the data reloc tree, we only need to
+ * COW down to the block, there's no need to generate a backref
+ * tree.
+ */
+ if (block->owner &&
+ (!is_fstree(block->owner) ||
+ block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
+ ret = relocate_cowonly_block(trans, rc, block, path);
+ if (ret)
+ break;
+ continue;
+ }
+
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -2902,6 +2825,7 @@ static int relocate_one_folio(struct reloc_control *rc,
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
+again:
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
@@ -2937,11 +2861,16 @@ static int relocate_one_folio(struct reloc_control *rc,
ret = -EIO;
goto release_folio;
}
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
/*
* We could have lost folio private when we dropped the lock to read the
- * folio above, make sure we set_page_extent_mapped here so we have any
+ * folio above, make sure we set_folio_extent_mapped() here so we have any
* of the subpage blocksize stuff we need in place.
*/
ret = set_folio_extent_mapped(folio);
@@ -3310,21 +3239,23 @@ out:
return ret;
}
-static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group,
+static int delete_block_group_cache(struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_inode *btrfs_inode;
int ret = 0;
if (inode)
goto truncate;
- inode = btrfs_iget(ino, root);
- if (IS_ERR(inode))
+ btrfs_inode = btrfs_iget(ino, root);
+ if (IS_ERR(btrfs_inode))
return -ENOENT;
+ inode = &btrfs_inode->vfs_inode;
truncate:
ret = btrfs_check_trunc_cache_free_space(fs_info,
@@ -3384,8 +3315,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
}
if (!found)
return -ENOENT;
- ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
- space_cache_ino);
+ ret = delete_block_group_cache(block_group, NULL, space_cache_ino);
return ret;
}
@@ -3793,7 +3723,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
- btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -3833,10 +3762,10 @@ out:
* the inode is in data relocation tree and its link count is 0
*/
static noinline_for_stack struct inode *create_reloc_inode(
- struct btrfs_fs_info *fs_info,
const struct btrfs_block_group *group)
{
- struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info = group->fs_info;
+ struct btrfs_inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
@@ -3864,18 +3793,19 @@ static noinline_for_stack struct inode *create_reloc_inode(
inode = NULL;
goto out;
}
- BTRFS_I(inode)->reloc_block_group_start = group->start;
+ inode->reloc_block_group_start = group->start;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (ret) {
- iput(inode);
- inode = ERR_PTR(ret);
+ if (inode)
+ iput(&inode->vfs_inode);
+ return ERR_PTR(ret);
}
- return inode;
+ return &inode->vfs_inode;
}
/*
@@ -4049,7 +3979,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ ret = delete_block_group_cache(rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
@@ -4058,7 +3988,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ rc->data_inode = create_reloc_inode(rc->block_group);
if (IS_ERR(rc->data_inode)) {
err = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
@@ -4399,8 +4329,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
- BUG_ON(node->bytenr != buf->start &&
- node->new_bytenr != buf->start);
+
+ /*
+ * If node->bytenr != buf->start and node->new_bytenr !=
+ * buf->start then we've got the wrong backref node for what we
+ * expected to see here and the cache is incorrect.
+ */
+ if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
+ btrfs_err(fs_info,
+"bytenr %llu was found but our backref cache was expecting %llu or %llu",
+ buf->start, node->bytenr, node->new_bytenr);
+ return -EUCLEAN;
+ }
btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
@@ -4500,10 +4440,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
new_root->reloc_root = btrfs_grab_root(reloc_root);
-
- if (rc->create_reloc_tree)
- ret = clone_backref_node(trans, rc, root, reloc_root);
- return ret;
+ return 0;
}
/*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 33962671a96c..e22e6b06927a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -197,7 +197,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
return ret;
@@ -447,7 +446,6 @@ again:
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 204c928beaf9..2c5edcee9450 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1380,11 +1380,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
if (path->nodes[0])
goto search_forward;
+ key.objectid = search_start;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = search_start;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -1541,6 +1541,10 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
+ if (unlikely(!extent_root)) {
+ btrfs_err(fs_info, "no valid extent root for scrub");
+ return -EUCLEAN;
+ }
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
stripe->nr_sectors);
scrub_stripe_reset_bitmaps(stripe);
@@ -2493,8 +2497,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
path->skip_locking = 1;
key.objectid = scrub_dev->devid;
- key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0ull;
while (1) {
u64 dev_extent_len;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7254279c3cc9..0c8c58c4f29b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -16,7 +16,6 @@
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
-
#include "send.h"
#include "ctree.h"
#include "backref.h"
@@ -178,6 +177,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ struct fs_path cur_inode_path;
bool cur_inode_new;
bool cur_inode_new_gen;
bool cur_inode_deleted;
@@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx)
static void fs_path_reset(struct fs_path *p)
{
- if (p->reversed) {
+ if (p->reversed)
p->start = p->buf + p->buf_len - 1;
- p->end = p->start;
- *p->start = 0;
- } else {
+ else
p->start = p->buf;
- p->end = p->start;
- *p->start = 0;
- }
+
+ p->end = p->start;
+ *p->start = 0;
+}
+
+static void init_path(struct fs_path *p)
+{
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
}
static struct fs_path *fs_path_alloc(void)
@@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void)
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
- p->reversed = 0;
- p->buf = p->inline_buf;
- p->buf_len = FS_PATH_INLINE_SIZE;
- fs_path_reset(p);
+ init_path(p);
return p;
}
@@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p)
kfree(p);
}
-static int fs_path_len(struct fs_path *p)
+static inline int fs_path_len(const struct fs_path *p)
{
return p->end - p->start;
}
@@ -487,12 +490,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
- path_len = p->end - p->start;
+ path_len = fs_path_len(p);
old_buf_len = p->buf_len;
/*
@@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
int ret;
int new_len;
- new_len = p->end - p->start + name_len;
+ new_len = fs_path_len(p) + name_len;
if (p->start != p->end)
new_len++;
ret = fs_path_ensure_buf(p, new_len);
if (ret < 0)
- goto out;
+ return ret;
if (p->reversed) {
if (p->start != p->end)
@@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
*p->end = 0;
}
-out:
- return ret;
+ return 0;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
@@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len)
ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
memcpy(prepared, name, name_len);
-out:
- return ret;
+ return 0;
}
-static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2)
{
- int ret;
- char *prepared;
-
- ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
- if (ret < 0)
- goto out;
- memcpy(prepared, p2->start, p2->end - p2->start);
-
-out:
- return ret;
+ return fs_path_add(p, p2->start, fs_path_len(p2));
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
@@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(eb, prepared, off, len);
-out:
- return ret;
+ return 0;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
@@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p)
return;
tmp = p->start;
- len = p->end - p->start;
+ len = fs_path_len(p);
p->start = p->buf;
p->end = p->start + len;
memmove(p->start, tmp, len + 1);
p->reversed = 0;
}
+static inline bool is_current_inode_path(const struct send_ctx *sctx,
+ const struct fs_path *path)
+{
+ const struct fs_path *cur = &sctx->cur_inode_path;
+
+ return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0);
+}
+
static struct btrfs_path *alloc_path_for_send(void)
{
struct btrfs_path *path;
@@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
#define TLV_PUT_PATH(sctx, attrtype, p) \
do { \
ret = tlv_put_string(sctx, attrtype, p->start, \
- p->end - p->start); \
+ fs_path_len((p))); \
if (ret < 0) \
goto tlv_put_failure; \
} while(0)
@@ -826,7 +823,7 @@ static int send_rename(struct send_ctx *sctx,
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
@@ -834,7 +831,6 @@ static int send_rename(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -851,7 +847,7 @@ static int send_link(struct send_ctx *sctx,
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
@@ -859,7 +855,6 @@ static int send_link(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -875,14 +870,13 @@ static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -898,14 +892,13 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -1897,7 +1890,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
if (send_gen)
@@ -1908,7 +1901,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
} else {
ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
if (parent_gen)
@@ -1953,7 +1946,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = -ENOENT;
}
-out:
return ret;
}
@@ -1967,17 +1959,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
- goto out;
+ return ret;
if (ret == inode_state_no_change ||
ret == inode_state_did_create ||
ret == inode_state_will_delete)
- ret = 1;
- else
- ret = 0;
+ return 1;
-out:
- return ret;
+ return 0;
}
/*
@@ -2326,9 +2315,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
if (ret < 0)
- goto out;
- ret = nce->ret;
- goto out;
+ return ret;
+ return nce->ret;
}
}
@@ -2339,12 +2327,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*/
ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
goto out_cache;
}
@@ -2360,21 +2348,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
ret = get_first_ref(sctx->parent_root, ino,
parent_ino, parent_gen, dest);
if (ret < 0)
- goto out;
+ return ret;
/*
* Check if the ref was overwritten by an inode's ref that was processed
* earlier. If yes, treat as orphan and return 1.
*/
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
- dest->start, dest->end - dest->start);
+ dest->start, fs_path_len(dest));
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
fs_path_reset(dest);
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
}
@@ -2383,10 +2371,8 @@ out_cache:
* Store the result of the lookup in the name cache.
*/
nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
- if (!nce) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!nce)
+ return -ENOMEM;
nce->entry.key = ino;
nce->entry.gen = gen;
@@ -2404,10 +2390,9 @@ out_cache:
nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
if (nce_ret < 0) {
kfree(nce);
- ret = nce_ret;
+ return nce_ret;
}
-out:
return ret;
}
@@ -2444,6 +2429,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
+ const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
+
+ if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
+ if (dest != &sctx->cur_inode_path)
+ return fs_path_copy(dest, &sctx->cur_inode_path);
+
+ return 0;
+ }
name = fs_path_alloc();
if (!name) {
@@ -2495,8 +2488,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
out:
fs_path_free(name);
- if (!ret)
+ if (!ret) {
fs_path_unreverse(dest);
+ if (is_cur_inode && dest != &sctx->cur_inode_path)
+ ret = fs_path_copy(&sctx->cur_inode_path, dest);
+ }
+
return ret;
}
@@ -2591,6 +2588,47 @@ out:
return ret;
}
+static struct fs_path *get_cur_inode_path(struct send_ctx *sctx)
+{
+ if (fs_path_len(&sctx->cur_inode_path) == 0) {
+ int ret;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ &sctx->cur_inode_path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ return &sctx->cur_inode_path;
+}
+
+static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ struct fs_path *path;
+ int ret;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ return get_cur_inode_path(sctx);
+
+ path = fs_path_alloc();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = get_cur_path(sctx, ino, gen, path);
+ if (ret < 0) {
+ fs_path_free(path);
+ return ERR_PTR(ret);
+ }
+
+ return path;
+}
+
+static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path)
+{
+ if (path != &sctx->cur_inode_path)
+ fs_path_free(path);
+}
+
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2599,17 +2637,14 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
@@ -2617,7 +2652,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2629,17 +2664,14 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
@@ -2647,7 +2679,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2662,17 +2694,14 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
@@ -2680,7 +2709,7 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2693,17 +2722,14 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
ino, uid, gid);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
@@ -2712,7 +2738,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
@@ -2729,9 +2755,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
btrfs_debug(fs_info, "send_utimes %llu", ino);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
path = alloc_path_for_send();
if (!path) {
@@ -2756,9 +2782,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
@@ -2770,7 +2793,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
btrfs_free_path(path);
return ret;
}
@@ -3106,6 +3129,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
ret = send_rename(sctx, path, orphan);
+ if (ret < 0)
+ goto out;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ ret = fs_path_copy(&sctx->cur_inode_path, orphan);
out:
fs_path_free(orphan);
@@ -4158,6 +4186,23 @@ out:
return ret;
}
+static int rename_current_inode(struct send_ctx *sctx,
+ struct fs_path *current_path,
+ struct fs_path *new_path)
+{
+ int ret;
+
+ ret = send_rename(sctx, current_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ ret = fs_path_copy(&sctx->cur_inode_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ return fs_path_copy(current_path, new_path);
+}
+
/*
* This does all the move/link/unlink/rmdir magic.
*/
@@ -4172,9 +4217,9 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- int did_overwrite = 0;
- int is_orphan = 0;
u64 last_dir_ino_rm = 0;
+ bool did_overwrite = false;
+ bool is_orphan = false;
bool can_rename = true;
bool orphanized_dir = false;
bool orphanized_ancestor = false;
@@ -4216,14 +4261,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (ret)
- did_overwrite = 1;
+ did_overwrite = true;
}
if (sctx->cur_inode_new || did_overwrite) {
ret = gen_unique_name(sctx, sctx->cur_ino,
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
} else {
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
valid_path);
@@ -4348,6 +4393,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret > 0) {
orphanized_ancestor = true;
fs_path_reset(valid_path);
+ fs_path_reset(&sctx->cur_inode_path);
ret = get_cur_path(sctx, sctx->cur_ino,
sctx->cur_inode_gen,
valid_path);
@@ -4443,13 +4489,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* it depending on the inode mode.
*/
if (is_orphan && can_rename) {
- ret = send_rename(sctx, valid_path, cur->full_path);
- if (ret < 0)
- goto out;
- is_orphan = 0;
- ret = fs_path_copy(valid_path, cur->full_path);
+ ret = rename_current_inode(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
+ is_orphan = false;
} else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
@@ -4457,10 +4500,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
+ ret = rename_current_inode(sctx, valid_path,
cur->full_path);
if (ret < 0)
goto out;
@@ -4507,7 +4547,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
@@ -4553,6 +4593,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
+ if (is_current_inode_path(sctx, cur->full_path))
+ fs_path_reset(&sctx->cur_inode_path);
}
ret = dup_ref(cur, &check_dirs);
if (ret < 0)
@@ -4701,7 +4743,7 @@ out:
static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4710,7 +4752,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4724,13 +4766,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
&sctx->new_refs, name, dir, dir_gen,
sctx);
}
-out:
+
return ret;
}
static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4739,7 +4781,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4753,7 +4795,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
&sctx->deleted_refs, name, dir,
dir_gen, sctx);
}
-out:
+
return ret;
}
@@ -4764,11 +4806,9 @@ static int record_new_ref(struct send_ctx *sctx)
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_deleted_ref(struct send_ctx *sctx)
@@ -4779,29 +4819,25 @@ static int record_deleted_ref(struct send_ctx *sctx)
sctx->cmp_key, 0, record_deleted_ref_if_needed,
sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_changed_ref(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
+ return ret;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
/*
@@ -4869,15 +4905,19 @@ out:
}
static int send_set_xattr(struct send_ctx *sctx,
- struct fs_path *path,
const char *name, int name_len,
const char *data, int data_len)
{
- int ret = 0;
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4886,7 +4926,6 @@ static int send_set_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4894,11 +4933,11 @@ static int send_remove_xattr(struct send_ctx *sctx,
struct fs_path *path,
const char *name, int name_len)
{
- int ret = 0;
+ int ret;
ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4906,7 +4945,6 @@ static int send_remove_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4914,19 +4952,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len, const char *data,
int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
/* Capabilities are emitted by finish_inode_if_needed */
if (!strncmp(name, XATTR_NAME_CAPS, name_len))
return 0;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
/*
* This hack is needed because empty acls are stored as zero byte
* data in xattrs. Problem with that is, that receiving these zero byte
@@ -4943,48 +4975,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
}
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_set_xattr(sctx, name, name_len, data, data_len);
}
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_remove_xattr(sctx, p, name, name_len);
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
-out:
- fs_path_free(p);
- return ret;
+ return send_remove_xattr(sctx, p, name, name_len);
}
static int process_new_xattr(struct send_ctx *sctx)
{
- int ret = 0;
-
- ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- __process_new_xattr, sctx);
-
- return ret;
+ return iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_new_xattr, sctx);
}
static int process_deleted_xattr(struct send_ctx *sctx)
@@ -5100,17 +5111,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
static int process_changed_xattr(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
__process_changed_new_xattr, sctx);
if (ret < 0)
- goto out;
- ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
- __process_changed_deleted_xattr, sctx);
+ return ret;
-out:
- return ret;
+ return iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_changed_deleted_xattr, sctx);
}
static int process_all_new_xattrs(struct send_ctx *sctx)
@@ -5157,7 +5166,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
@@ -5172,21 +5181,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *p;
inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0);
if (ret < 0)
goto iput;
@@ -5203,27 +5211,19 @@ static int process_verity(struct send_ctx *sctx)
}
}
- ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret);
if (ret < 0)
goto iput;
- p = fs_path_alloc();
- if (!p) {
- ret = -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
goto iput;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto free_path;
ret = send_verity(sctx, p, sctx->verity_descriptor);
- if (ret < 0)
- goto free_path;
-
-free_path:
- fs_path_free(p);
iput:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
+again:
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
page_cache_sync_readahead(mapping,
@@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
ret = -EIO;
break;
}
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
}
memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
@@ -5337,31 +5343,25 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
- if (ret < 0)
- goto out;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
ret = put_file_data(sctx, offset, len);
if (ret < 0)
- goto out;
+ return ret;
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5374,6 +5374,7 @@ static int send_clone(struct send_ctx *sctx,
{
int ret = 0;
struct fs_path *p;
+ struct fs_path *cur_inode_path;
u64 gen;
btrfs_debug(sctx->send_root->fs_info,
@@ -5381,6 +5382,10 @@ static int send_clone(struct send_ctx *sctx,
offset, len, btrfs_root_id(clone_root->root),
clone_root->ino, clone_root->offset);
+ cur_inode_path = get_cur_inode_path(sctx);
+ if (IS_ERR(cur_inode_path))
+ return PTR_ERR(cur_inode_path);
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5389,13 +5394,9 @@ static int send_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);
if (clone_root->root == sctx->send_root) {
ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
@@ -5446,17 +5447,13 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
@@ -5465,8 +5462,6 @@ static int send_update_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5495,12 +5490,10 @@ static int send_hole(struct send_ctx *sctx, u64 end)
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto tlv_put_failure;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
while (offset < end) {
u64 len = min(end - offset, read_size);
@@ -5521,7 +5514,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
}
sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
- fs_path_free(p);
return ret;
}
@@ -5529,9 +5521,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
struct btrfs_path *path, u64 offset,
u64 len)
{
- struct btrfs_root *root = sctx->send_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5540,23 +5530,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
- goto out;
- }
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -5572,12 +5552,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, ei));
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
ret = put_data_header(sctx, inline_size);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
btrfs_file_extent_inline_start(ei), inline_size);
sctx->send_size += inline_size;
@@ -5585,9 +5565,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(fspath);
- iput(inode);
return ret;
}
@@ -5596,7 +5573,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5611,9 +5588,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (IS_ERR(inode))
return PTR_ERR(inode);
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath)) {
+ ret = PTR_ERR(fspath);
goto out;
}
@@ -5621,10 +5598,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
@@ -5666,7 +5639,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
+ ret = btrfs_encoded_read_regular_fill_pages(inode,
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT),
@@ -5692,8 +5665,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
tlv_put_failure:
out:
- fs_path_free(fspath);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5735,15 +5707,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
}
if (sctx->cur_inode == NULL) {
+ struct btrfs_inode *btrfs_inode;
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(sctx->cur_inode)) {
- int err = PTR_ERR(sctx->cur_inode);
+ btrfs_inode = btrfs_iget(sctx->cur_ino, root);
+ if (IS_ERR(btrfs_inode))
+ return PTR_ERR(btrfs_inode);
- sctx->cur_inode = NULL;
- return err;
- }
+ sctx->cur_inode = &btrfs_inode->vfs_inode;
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
@@ -5822,7 +5793,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct fs_path *fspath = NULL;
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
@@ -5848,25 +5818,19 @@ static int send_capabilities(struct send_ctx *sctx)
leaf = path->nodes[0];
buf_len = btrfs_dir_data_len(leaf, di);
- fspath = fs_path_alloc();
buf = kmalloc(buf_len, GFP_KERNEL);
- if (!fspath || !buf) {
+ if (!buf) {
ret = -ENOMEM;
goto out;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
read_extent_buffer(leaf, buf, data_ptr, buf_len);
- ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
kfree(buf);
- fs_path_free(fspath);
btrfs_free_path(path);
return ret;
}
@@ -6892,6 +6856,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
+ fs_path_reset(&sctx->cur_inode_path);
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -7253,7 +7218,7 @@ static int changed_cb(struct btrfs_path *left_path,
enum btrfs_compare_tree_result result,
struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
/*
* We can not hold the commit root semaphore here. This is because in
@@ -7313,7 +7278,6 @@ static int changed_cb(struct btrfs_path *left_path,
return 0;
}
result = BTRFS_COMPARE_TREE_CHANGED;
- ret = 0;
}
sctx->left_path = left_path;
@@ -8102,10 +8066,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
@@ -8168,6 +8131,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
goto out;
}
+ init_path(&sctx->cur_inode_path);
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
@@ -8444,6 +8408,9 @@ out:
btrfs_lru_cache_clear(&sctx->dir_created_cache);
btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
+ if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
+ kfree(sctx->cur_inode_path.buf);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9309886c5ea1..652bb28f63d4 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct btrfs_inode;
+struct btrfs_root;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 255e85f78313..ff089e3e4103 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/spinlock.h"
+#include <linux/spinlock.h>
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
@@ -14,6 +14,7 @@
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
+#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -127,6 +128,14 @@
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
+ * RESET_ZONES
+ * This state works only for the zoned mode. On the zoned mode, we cannot
+ * reuse once allocated then freed region until we reset the zone, due to
+ * the sequential write zone requirement. The RESET_ZONES state resets the
+ * zones of an unused block group and let us reuse the space. The reusing
+ * is faster than removing the block group and allocating another block
+ * group on the zones.
+ *
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
@@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
found->bytes_used += block_group->used;
found->disk_used += block_group->used * factor;
found->bytes_readonly += block_group->bytes_super;
- btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);
if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
@@ -489,9 +498,7 @@ again:
if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
- btrfs_space_info_update_bytes_may_use(fs_info,
- space_info,
- ticket->bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
@@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
*/
ret = btrfs_commit_current_transaction(root);
break;
+ case RESET_ZONES:
+ ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
+ break;
default:
ret = -ENOSPC;
break;
@@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
+ enum btrfs_flush_state final_state;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ if (btrfs_is_zoned(fs_info))
+ final_state = RESET_ZONES;
+ else
+ final_state = COMMIT_TRANS;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
@@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
- if (flush_state > COMMIT_TRANS) {
+ if (flush_state > final_state) {
commit_cycles++;
if (commit_cycles > 2) {
if (maybe_fail_all_tickets(fs_info, space_info)) {
@@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
}
spin_unlock(&space_info->lock);
- } while (flush_state <= COMMIT_TRANS);
+ } while (flush_state <= final_state);
}
/*
@@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
+ * RESET_ZONES
+ * This state works only for the zoned mode. We scan the unused block group
+ * list and reset the zones and reuse the block group.
+ *
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
* before, and then the transaction commit could have freed new block groups,
@@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
+ RESET_ZONES,
ALLOC_CHUNK_FORCE,
};
@@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
+ RESET_ZONES,
ALLOC_CHUNK,
};
@@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
+ RESET_ZONES,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
@@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
@@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
used = btrfs_space_info_used(space_info, false);
if (used + orig_bytes <= space_info->total_bytes) {
- btrfs_space_info_update_bytes_may_use(fs_info, space_info,
- orig_bytes);
+ btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
}
@@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
do_reclaim_sweep(space_info, raid);
}
}
+
+void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
+ lockdep_assert_held(&space_info->lock);
+
+ /* Prioritize the global reservation to receive the freed space. */
+ if (global_rsv->space_info != space_info)
+ goto grant;
+
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ u64 to_add = min(len, global_rsv->size - global_rsv->reserved);
+
+ global_rsv->reserved += to_add;
+ btrfs_space_info_update_bytes_may_use(space_info, to_add);
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ len -= to_add;
+ }
+ spin_unlock(&global_rsv->lock);
+
+grant:
+ /* Add to any tickets we may have. */
+ if (len)
+ btrfs_try_granting_tickets(fs_info, space_info);
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index efbecc0c5258..a96efdb5e681 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_EMERGENCY,
};
+/*
+ * Please be aware that the order of enum values will be the order of the reclaim
+ * process in btrfs_async_reclaim_metadata_space().
+ */
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
@@ -91,6 +95,7 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 9,
RUN_DELAYED_IPUTS = 10,
COMMIT_TRANS = 11,
+ RESET_ZONES = 12,
};
struct btrfs_space_info {
@@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i
*/
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
static inline void \
-btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
- struct btrfs_space_info *sinfo, \
+btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \
s64 bytes) \
{ \
+ struct btrfs_fs_info *fs_info = sinfo->fs_info; \
const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
lockdep_assert_held(&sinfo->lock); \
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
@@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
- struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes)
{
spin_lock(&space_info->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
- btrfs_try_granting_tickets(fs_info, space_info);
+ btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
+ btrfs_try_granting_tickets(space_info->fs_info, space_info);
spin_unlock(&space_info->lock);
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
@@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
+void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 8c68059ac1b0..c0a0b8b063d0 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -2,12 +2,11 @@
#include <linux/slab.h>
#include "messages.h"
-#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
/*
- * Subpage (sectorsize < PAGE_SIZE) support overview:
+ * Subpage (block size < folio size) support overview:
*
* Limitations:
*
@@ -64,35 +63,14 @@
* This means a slightly higher tree locking latency.
*/
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
-{
- if (fs_info->sectorsize >= PAGE_SIZE)
- return false;
-
- /*
- * Only data pages (either through DIO or compression) can have no
- * mapping. And if page->mapping->host is data inode, it's subpage.
- * As we have ruled our sectorsize >= PAGE_SIZE case already.
- */
- if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
- return true;
-
- /*
- * Now the only remaining case is metadata, which we only go subpage
- * routine if nodesize < PAGE_SIZE.
- */
- if (fs_info->nodesize < PAGE_SIZE)
- return true;
- return false;
-}
-#endif
-
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
+ /* For metadata we don't support large folio yet. */
+ ASSERT(!folio_test_large(folio));
+
/*
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
@@ -101,10 +79,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
+ if (folio_test_private(folio))
+ return 0;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return 0;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, type);
+ subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
@@ -112,12 +94,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
+ if (!folio_test_private(folio))
+ return;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
subpage = folio_detach_private(folio);
@@ -126,15 +113,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol
}
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type)
+ size_t fsize, enum btrfs_subpage_type type)
{
struct btrfs_subpage *ret;
unsigned int real_size;
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->sectorsize < fsize);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max *
+ (fsize >> fs_info->sectorsize_bits)));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -165,7 +153,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
@@ -179,7 +167,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
@@ -206,16 +194,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
*/
if (folio->mapping)
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_pos(folio) + folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int __start_bit; \
+ unsigned int __start_bit; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
+ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \
+ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \
__start_bit; \
})
@@ -233,7 +223,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + folio_size(folio),
orig_start + orig_len) - *start;
}
@@ -296,7 +286,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -323,13 +313,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
bool last = false;
int cleared = 0;
int bit;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -341,7 +332,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
}
spin_lock_irqsave(&subpage->lock, flags);
- for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bitmap, blocks_per_folio) {
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++;
}
@@ -352,15 +343,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
folio_unlock(folio);
}
-#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+#define subpage_test_bitmap_all_set(fs_info, folio, name) \
+({ \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
-#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
+({ \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
@@ -372,7 +375,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -426,7 +429,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
@@ -467,7 +470,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
@@ -498,7 +501,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -513,7 +516,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -569,7 +572,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -579,7 +582,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -589,7 +592,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
@@ -597,7 +600,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -608,7 +611,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -619,10 +622,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
btrfs_subpage_clamp_range(folio, &start, &len); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+} \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_set_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_clear_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
folio_test_uptodate);
@@ -635,6 +660,31 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
folio_test_checked);
+#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
+{ \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_subpage *subpage = folio_get_private(folio); \
+ \
+ ASSERT(blocks_per_folio <= BITS_PER_LONG); \
+ *dst = bitmap_read(subpage->bitmaps, \
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+}
+
+#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
+{ \
+ unsigned long bitmap; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
+ btrfs_warn(fs_info, \
+ "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
+ start, len, folio_pos(folio), \
+ blocks_per_folio, &bitmap); \
+}
+
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
@@ -650,7 +700,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
ASSERT(!folio_test_dirty(folio));
return;
}
@@ -660,6 +710,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
subpage = folio_get_private(folio);
ASSERT(subpage);
spin_lock_irqsave(&subpage->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ }
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -681,7 +735,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
int ret;
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
subpage = folio_get_private(folio);
@@ -689,57 +743,74 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
nbits = len >> fs_info->sectorsize_bits;
spin_lock_irqsave(&subpage->lock, flags);
/* Target range should not yet be locked. */
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ }
bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->nr_locked);
- ASSERT(ret <= fs_info->sectors_per_page);
+ ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
spin_unlock_irqrestore(&subpage->lock, flags);
}
-#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
-{ \
- const int sectors_per_page = fs_info->sectors_per_page; \
- \
- ASSERT(sectors_per_page < BITS_PER_LONG); \
- *dst = bitmap_read(subpage->bitmaps, \
- sectors_per_page * btrfs_bitmap_nr_##name, \
- sectors_per_page); \
+/*
+ * Clear the dirty flag for the folio.
+ *
+ * If the affected folio is no longer dirty, return true. Otherwise return false.
+ */
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb)
+{
+ bool last;
+
+ if (!btrfs_meta_is_subpage(eb->fs_info)) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+
+ last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len);
+ if (last) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+ return false;
}
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- const u32 sectors_per_page = fs_info->sectors_per_page;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
unsigned long checked_bitmap;
+ unsigned long locked_bitmap;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(sectors_per_page > 1);
+ ASSERT(blocks_per_folio > 1);
subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
- sectors_per_page, &uptodate_bitmap,
- sectors_per_page, &dirty_bitmap,
- sectors_per_page, &writeback_bitmap,
- sectors_per_page, &ordered_bitmap,
- sectors_per_page, &checked_bitmap);
+ blocks_per_folio, &uptodate_bitmap,
+ blocks_per_folio, &dirty_bitmap,
+ blocks_per_folio, &locked_bitmap,
+ blocks_per_folio, &writeback_bitmap,
+ blocks_per_folio, &ordered_bitmap,
+ blocks_per_folio, &checked_bitmap);
}
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
@@ -750,10 +821,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(fs_info->sectors_per_page > 1);
+ ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 428fa9389fd4..3042c5ea840a 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,11 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include "btrfs_inode.h"
+#include "fs.h"
struct address_space;
struct folio;
-struct btrfs_fs_info;
/*
* Extra info for subpapge bitmap.
@@ -69,23 +70,49 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
+#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE
+/*
+ * Subpage support for metadata is more complex, as we can have dummy extent
+ * buffers, where folios have no mapping to determine the owning inode.
+ *
+ * Thankfully we only need to check if node size is smaller than page size.
+ * Even with larger folio support, we will only allocate a folio as large as
+ * node size.
+ * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine.
+ */
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->nodesize < PAGE_SIZE;
+}
+static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
+ struct folio *folio)
+{
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
+ return fs_info->sectorsize < folio_size(folio);
+}
#else
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct address_space *mapping)
+ struct folio *folio)
{
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
return false;
}
#endif
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_subpage_type type);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type);
+ size_t fsize, enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
@@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
* btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
+ *
+ * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios.
+ *
+ * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there
+ * is no clamp version for metadata helpers, as we either go subpage
+ * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE,
+ * and our folio is never larger than nodesize).
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len); \
bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct folio *folio, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -137,11 +174,25 @@ DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
+/*
+ * Helper for error cleanup, where a folio will have its dirty flag cleared,
+ * with writeback started and finished.
+ */
+static inline void btrfs_folio_clamp_finish_io(struct btrfs_fs_info *fs_info,
+ struct folio *locked_folio,
+ u64 start, u32 len)
+{
+ btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
+}
+
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7dfe5005129a..7121d8c7a318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -84,7 +84,7 @@ struct btrfs_fs_context {
u32 thread_pool_size;
unsigned long long mount_opt;
unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_level;
refcount_t refs;
};
@@ -947,7 +947,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
int err;
@@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
- sb->s_iflags |= SB_I_CGROUPWB;
+ sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
err = super_setup_bdi(sb);
if (err) {
@@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb,
err = open_ctree(sb, fs_devices);
if (err) {
- btrfs_err(fs_info, "open_ctree failed");
+ btrfs_err(fs_info, "open_ctree failed: %d", err);
return err;
}
@@ -982,7 +982,7 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}
- sb->s_root = d_make_root(inode);
+ sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
@@ -1139,8 +1139,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
@@ -2446,6 +2445,9 @@ static __cold void btrfs_interface_exit(void)
static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ ", experimental=on"
+#endif
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
@@ -2466,7 +2468,17 @@ static int __init btrfs_print_mod_info(void)
", fsverity=no"
#endif
;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ if (btrfs_get_mod_read_policy() == NULL)
+ pr_info("Btrfs loaded%s\n", options);
+ else
+ pr_info("Btrfs loaded%s, read_policy=%s\n",
+ options, btrfs_get_mod_read_policy());
+#else
pr_info("Btrfs loaded%s\n", options);
+#endif
+
return 0;
}
@@ -2524,6 +2536,11 @@ static const struct init_sequence mod_init_seq[] = {
}, {
.init_func = extent_map_init,
.exit_func = extent_map_exit,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ }, {
+ .init_func = btrfs_read_policy_init,
+ .exit_func = NULL,
+#endif
}, {
.init_func = ordered_data_init,
.exit_func = ordered_data_exit,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index fdcbf650ac31..b9af74498b0c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
+ ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
@@ -1118,7 +1119,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -1128,7 +1129,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -1180,7 +1181,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -1305,7 +1306,74 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
}
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
-static const char * const btrfs_read_policy_name[] = { "pid" };
+static const char *btrfs_read_policy_name[] = {
+ "pid",
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ "round-robin",
+ "devid",
+#endif
+};
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+
+/* Global module configuration parameters. */
+static char *read_policy;
+char *btrfs_get_mod_read_policy(void)
+{
+ return read_policy;
+}
+
+/* Set perms to 0, disable /sys/module/btrfs/parameter/read_policy interface. */
+module_param(read_policy, charp, 0);
+MODULE_PARM_DESC(read_policy,
+"Global read policy: pid (default), round-robin[:<min_contig_read>], devid[:<devid>]");
+#endif
+
+int btrfs_read_policy_to_enum(const char *str, s64 *value_ret)
+{
+ char param[32];
+ char __maybe_unused *value_str;
+
+ if (!str || strlen(str) == 0)
+ return 0;
+
+ strscpy(param, str);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Separate value from input in policy:value format. */
+ value_str = strchr(param, ':');
+ if (value_str) {
+ char *retptr;
+
+ *value_str = 0;
+ value_str++;
+ if (!value_ret)
+ return -EINVAL;
+
+ *value_ret = memparse(value_str, &retptr);
+ /* There could be any trailing typos after the value. */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || *value_ret <= 0)
+ return -EINVAL;
+ }
+#endif
+
+ return sysfs_match_string(btrfs_read_policy_name, param);
+}
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+int __init btrfs_read_policy_init(void)
+{
+ s64 value;
+
+ if (btrfs_read_policy_to_enum(read_policy, &value) == -EINVAL) {
+ btrfs_err(NULL, "invalid read policy or value %s", read_policy);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@@ -1316,14 +1384,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (policy == i)
- ret += sysfs_emit_at(buf, ret, "%s[%s]",
- (ret == 0 ? "" : " "),
- btrfs_read_policy_name[i]);
- else
- ret += sysfs_emit_at(buf, ret, "%s%s",
- (ret == 0 ? "" : " "),
- btrfs_read_policy_name[i]);
+ if (ret != 0)
+ ret += sysfs_emit_at(buf, ret, " ");
+
+ if (i == policy)
+ ret += sysfs_emit_at(buf, ret, "[");
+
+ ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ if (i == BTRFS_READ_POLICY_RR)
+ ret += sysfs_emit_at(buf, ret, ":%u",
+ READ_ONCE(fs_devices->rr_min_contig_read));
+
+ if (i == BTRFS_READ_POLICY_DEVID)
+ ret += sysfs_emit_at(buf, ret, ":%llu",
+ READ_ONCE(fs_devices->read_devid));
+#endif
+ if (i == policy)
+ ret += sysfs_emit_at(buf, ret, "]");
}
ret += sysfs_emit_at(buf, ret, "\n");
@@ -1336,21 +1415,80 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
const char *buf, size_t len)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
- int i;
+ int index;
+ s64 value = -1;
- for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
- if (i != READ_ONCE(fs_devices->read_policy)) {
- WRITE_ONCE(fs_devices->read_policy, i);
- btrfs_info(fs_devices->fs_info,
- "read policy set to '%s'",
- btrfs_read_policy_name[i]);
+ index = btrfs_read_policy_to_enum(buf, &value);
+ if (index < 0)
+ return -EINVAL;
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* If moving from RR then disable collecting fs stats. */
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR && index != BTRFS_READ_POLICY_RR)
+ fs_devices->collect_fs_stats = false;
+
+ if (index == BTRFS_READ_POLICY_RR) {
+ if (value != -1) {
+ const u32 sectorsize = fs_devices->fs_info->sectorsize;
+
+ if (!IS_ALIGNED(value, sectorsize)) {
+ u64 temp_value = round_up(value, sectorsize);
+
+ btrfs_debug(fs_devices->fs_info,
+"read_policy: min contig read %lld should be multiple of sectorsize %u, rounded to %llu",
+ value, sectorsize, temp_value);
+ value = temp_value;
}
- return len;
+ } else {
+ value = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
}
+
+ if (index != READ_ONCE(fs_devices->read_policy) ||
+ value != READ_ONCE(fs_devices->rr_min_contig_read)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ WRITE_ONCE(fs_devices->rr_min_contig_read, value);
+
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
+ btrfs_read_policy_name[index], value);
+ }
+
+ fs_devices->collect_fs_stats = true;
+
+ return len;
}
- return -EINVAL;
+ if (index == BTRFS_READ_POLICY_DEVID) {
+ if (value != -1) {
+ BTRFS_DEV_LOOKUP_ARGS(args);
+
+ /* Validate input devid. */
+ args.devid = value;
+ if (btrfs_find_device(fs_devices, &args) == NULL)
+ return -EINVAL;
+ } else {
+ /* Set default devid to the devid of the latest device. */
+ value = fs_devices->latest_dev->devid;
+ }
+
+ if (index != READ_ONCE(fs_devices->read_policy) ||
+ value != READ_ONCE(fs_devices->read_devid)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ WRITE_ONCE(fs_devices->read_devid, value);
+
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'",
+ btrfs_read_policy_name[index], value);
+ }
+
+ return len;
+ }
+#endif
+ if (index != READ_ONCE(fs_devices->read_policy)) {
+ WRITE_ONCE(fs_devices->read_policy, index);
+ btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
+ btrfs_read_policy_name[index]);
+ }
+
+ return len;
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index e6a284c59809..0f94ae923210 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -7,6 +7,7 @@
#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct block_device;
struct btrfs_fs_info;
struct btrfs_device;
struct btrfs_fs_devices;
@@ -47,5 +48,11 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup);
+int btrfs_read_policy_to_enum(const char *str, s64 *value);
+
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+int __init btrfs_read_policy_init(void);
+char *btrfs_get_mod_read_policy(void);
+#endif
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index e607b5d52fb1..5eff8d7d2360 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -30,6 +30,7 @@ const char *test_error[] = {
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
[TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
+ [TEST_ALLOC_TRANSACTION] = "cannot allocate transaction",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+
+ /* CRC32C csum size. */
+ fs_info->csum_size = 4;
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
+ fs_info->csum_size;
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
kfree(cache);
}
+void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->fs_info = fs_info;
+ xa_init(&trans->delayed_refs.head_refs);
+ xa_init(&trans->delayed_refs.dirty_extents);
+ spin_lock_init(&trans->delayed_refs.lock);
+}
+
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_delayed_refs(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index b524ecf2f452..4307bdaa6749 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_TESTS_H
#define BTRFS_TESTS_H
+#include <linux/types.h>
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_run_sanity_tests(void);
@@ -25,12 +27,14 @@ enum {
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
TEST_ALLOC_IO_CONTEXT,
+ TEST_ALLOC_TRANSACTION,
};
extern const char *test_error[];
struct btrfs_root;
struct btrfs_trans_handle;
+struct btrfs_transaction;
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
@@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
+int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
@@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
new file mode 100644
index 000000000000..265370e79a54
--- /dev/null
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sizes.h>
+#include "btrfs-tests.h"
+#include "../transaction.h"
+#include "../delayed-ref.h"
+#include "../extent-tree.h"
+
+#define FAKE_ROOT_OBJECTID 256
+#define FAKE_BYTENR 0
+#define FAKE_LEVEL 1
+#define FAKE_INO 256
+#define FAKE_FILE_OFFSET 0
+#define FAKE_PARENT SZ_1M
+
+struct ref_head_check {
+ u64 bytenr;
+ u64 num_bytes;
+ int ref_mod;
+ int total_ref_mod;
+ int must_insert;
+};
+
+struct ref_node_check {
+ u64 bytenr;
+ u64 num_bytes;
+ int ref_mod;
+ enum btrfs_delayed_ref_action action;
+ u8 type;
+ u64 parent;
+ u64 root;
+ u64 owner;
+ u64 offset;
+};
+
+static enum btrfs_ref_type ref_type_from_disk_ref_type(u8 type)
+{
+ if ((type == BTRFS_TREE_BLOCK_REF_KEY) ||
+ (type == BTRFS_SHARED_BLOCK_REF_KEY))
+ return BTRFS_REF_METADATA;
+ return BTRFS_REF_DATA;
+}
+
+static void delete_delayed_ref_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&head->lock);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_delayed_ref_unlock(head);
+ btrfs_put_delayed_ref_head(head);
+}
+
+static void delete_delayed_ref_node(struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *node)
+{
+ rb_erase_cached(&node->ref_node, &head->ref_tree);
+ RB_CLEAR_NODE(&node->ref_node);
+ if (!list_empty(&node->add_list))
+ list_del_init(&node->add_list);
+ btrfs_put_delayed_ref(node);
+}
+
+static int validate_ref_head(struct btrfs_delayed_ref_head *head,
+ struct ref_head_check *check)
+{
+ if (head->bytenr != check->bytenr) {
+ test_err("invalid bytenr have: %llu want: %llu", head->bytenr,
+ check->bytenr);
+ return -EINVAL;
+ }
+
+ if (head->num_bytes != check->num_bytes) {
+ test_err("invalid num_bytes have: %llu want: %llu",
+ head->num_bytes, check->num_bytes);
+ return -EINVAL;
+ }
+
+ if (head->ref_mod != check->ref_mod) {
+ test_err("invalid ref_mod have: %d want: %d", head->ref_mod,
+ check->ref_mod);
+ return -EINVAL;
+ }
+
+ if (head->total_ref_mod != check->total_ref_mod) {
+ test_err("invalid total_ref_mod have: %d want: %d",
+ head->total_ref_mod, check->total_ref_mod);
+ return -EINVAL;
+ }
+
+ if (head->must_insert_reserved != check->must_insert) {
+ test_err("invalid must_insert have: %d want: %d",
+ head->must_insert_reserved, check->must_insert);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int validate_ref_node(struct btrfs_delayed_ref_node *node,
+ struct ref_node_check *check)
+{
+ if (node->bytenr != check->bytenr) {
+ test_err("invalid bytenr have: %llu want: %llu", node->bytenr,
+ check->bytenr);
+ return -EINVAL;
+ }
+
+ if (node->num_bytes != check->num_bytes) {
+ test_err("invalid num_bytes have: %llu want: %llu",
+ node->num_bytes, check->num_bytes);
+ return -EINVAL;
+ }
+
+ if (node->ref_mod != check->ref_mod) {
+ test_err("invalid ref_mod have: %d want: %d", node->ref_mod,
+ check->ref_mod);
+ return -EINVAL;
+ }
+
+ if (node->action != check->action) {
+ test_err("invalid action have: %d want: %d", node->action,
+ check->action);
+ return -EINVAL;
+ }
+
+ if (node->parent != check->parent) {
+ test_err("invalid parent have: %llu want: %llu", node->parent,
+ check->parent);
+ return -EINVAL;
+ }
+
+ if (node->ref_root != check->root) {
+ test_err("invalid root have: %llu want: %llu", node->ref_root,
+ check->root);
+ return -EINVAL;
+ }
+
+ if (node->type != check->type) {
+ test_err("invalid type have: %d want: %d", node->type,
+ check->type);
+ return -EINVAL;
+ }
+
+ if (btrfs_delayed_ref_owner(node) != check->owner) {
+ test_err("invalid owner have: %llu want: %llu",
+ btrfs_delayed_ref_owner(node), check->owner);
+ return -EINVAL;
+ }
+
+ if (btrfs_delayed_ref_offset(node) != check->offset) {
+ test_err("invalid offset have: %llu want: %llu",
+ btrfs_delayed_ref_offset(node), check->offset);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int simple_test(struct btrfs_trans_handle *trans,
+ struct ref_head_check *head_check,
+ struct ref_node_check *node_check)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = ref_type_from_disk_ref_type(node_check->type),
+ .action = node_check->action,
+ .parent = node_check->parent,
+ .ref_root = node_check->root,
+ .bytenr = node_check->bytenr,
+ .num_bytes = fs_info->nodesize,
+ };
+ int ret;
+
+ if (ref.type == BTRFS_REF_METADATA)
+ btrfs_init_tree_ref(&ref, node_check->owner, node_check->root,
+ false);
+ else
+ btrfs_init_data_ref(&ref, node_check->owner, node_check->offset,
+ node_check->root, true);
+
+ if (ref.type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ return -EINVAL;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, head_check))
+ goto out;
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, node_check))
+ goto out;
+ ret = 0;
+out:
+ btrfs_unselect_ref_head(delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+/*
+ * These are simple tests, make sure that our btrfs_ref's get turned into the
+ * appropriate btrfs_delayed_ref_node based on their settings and action.
+ */
+static int simple_tests(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .total_ref_mod = 1,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .type = BTRFS_TREE_BLOCK_REF_KEY,
+ .parent = 0,
+ .root = FAKE_ROOT_OBJECTID,
+ .owner = FAKE_LEVEL,
+ .offset = 0,
+ };
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add tree block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add extent data failed");
+ return -EINVAL;
+ }
+
+ node_check.parent = FAKE_PARENT;
+ node_check.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add shared block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_SHARED_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single add shared data failed");
+ return -EINVAL;
+ }
+
+ head_check.ref_mod = -1;
+ head_check.total_ref_mod = -1;
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.type = BTRFS_TREE_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+ node_check.parent = 0;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop tree block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop extent data failed");
+ return -EINVAL;
+ }
+
+ node_check.parent = FAKE_PARENT;
+ node_check.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ node_check.offset = 0;
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop shared block failed");
+ return -EINVAL;
+ }
+
+ node_check.type = BTRFS_SHARED_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+ if (simple_test(trans, &head_check, &node_check)) {
+ test_err("single drop shared data failed");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge tests, validate that we do delayed ref merging properly, the ref counts
+ * all end up properly, and delayed refs are deleted once they're no longer
+ * needed.
+ */
+static int merge_tests(struct btrfs_trans_handle *trans,
+ enum btrfs_ref_type type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head = NULL;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = type,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .parent = 0,
+ .ref_root = FAKE_ROOT_OBJECTID,
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ };
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 0,
+ .total_ref_mod = 0,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 2,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .parent = 0,
+ .root = FAKE_ROOT_OBJECTID,
+ };
+ int ret;
+
+ /*
+ * First add a ref and then drop it, make sure we get a head ref with a
+ * 0 total ref mod and no nodes.
+ */
+ if (type == BTRFS_REF_METADATA) {
+ node_check.type = BTRFS_TREE_BLOCK_REF_KEY;
+ node_check.owner = FAKE_LEVEL;
+ btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false);
+ } else {
+ node_check.type = BTRFS_EXTENT_DATA_REF_KEY;
+ node_check.owner = FAKE_INO;
+ node_check.offset = FAKE_FILE_OFFSET;
+ btrfs_init_data_ref(&ref, FAKE_INO, FAKE_FILE_OFFSET,
+ FAKE_ROOT_OBJECTID, true);
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("single add and drop failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+ * Add a ref, then add another ref, make sure we get a head ref with a
+ * 2 total ref mod and 1 node.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ head_check.ref_mod = 2;
+ head_check.total_ref_mod = 2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double add failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Add two drop refs, make sure they are merged properly. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ goto out;
+ }
+
+ head_check.ref_mod = -2;
+ head_check.total_ref_mod = -2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double drop failed");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Add multiple refs, then drop until we go negative again. */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 10; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 12; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = -2;
+ head_check.total_ref_mod = -2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("double drop failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /* Drop multiple refs, then add until we go positive again. */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 10; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 12; i++) {
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = 2;
+ head_check.total_ref_mod = 2;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("add and drop to positive failed");
+ goto out;
+ }
+
+ node_check.action = BTRFS_ADD_DELAYED_REF;
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+ * Add a bunch of refs with different roots and parents, then drop them
+ * all, make sure everything is properly merged.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ for (int i = 0; i < 50; i++) {
+ if (!(i % 2)) {
+ ref.parent = 0;
+ ref.ref_root = FAKE_ROOT_OBJECTID + i;
+ } else {
+ ref.parent = FAKE_PARENT + (i * fs_info->nodesize);
+ }
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ for (int i = 0; i < 50; i++) {
+ if (!(i % 2)) {
+ ref.parent = 0;
+ ref.ref_root = FAKE_ROOT_OBJECTID + i;
+ } else {
+ ref.parent = FAKE_PARENT + (i * fs_info->nodesize);
+ }
+ if (type == BTRFS_REF_METADATA)
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ else
+ ret = btrfs_add_delayed_data_ref(trans, &ref, 0);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+ }
+
+ head = btrfs_select_ref_head(fs_info, &trans->transaction->delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ head_check.ref_mod = 0;
+ head_check.total_ref_mod = 0;
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("add and drop multiple failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (node) {
+ test_err("found node when none should exist");
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR_OR_NULL(head))
+ btrfs_unselect_ref_head(&trans->transaction->delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+/*
+ * Basic test to validate we always get the add operations first followed by any
+ * delete operations.
+ */
+static int select_delayed_refs_test(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_head *head = NULL;
+ struct btrfs_delayed_ref_node *node;
+ struct btrfs_ref ref = {
+ .type = BTRFS_REF_METADATA,
+ .action = BTRFS_DROP_DELAYED_REF,
+ .parent = 0,
+ .ref_root = FAKE_ROOT_OBJECTID,
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ };
+ struct ref_head_check head_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 0,
+ .total_ref_mod = 0,
+ };
+ struct ref_node_check node_check = {
+ .bytenr = FAKE_BYTENR,
+ .num_bytes = fs_info->nodesize,
+ .ref_mod = 1,
+ .action = BTRFS_ADD_DELAYED_REF,
+ .type = BTRFS_TREE_BLOCK_REF_KEY,
+ .parent = 0,
+ .owner = FAKE_LEVEL,
+ .offset = 0,
+ };
+ int ret;
+
+ /* Add the drop first. */
+ btrfs_init_tree_ref(&ref, FAKE_LEVEL, FAKE_ROOT_OBJECTID, false);
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ return ret;
+ }
+
+ /*
+ * Now add the add, and make it a different root so it's logically later
+ * in the rb tree.
+ */
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 1;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ head = NULL;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("head check failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.root = FAKE_ROOT_OBJECTID + 1;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+ delete_delayed_ref_head(trans, head);
+ head = NULL;
+
+ /*
+ * Now we're going to do the same thing, but we're going to have an add
+ * that gets deleted because of a merge, and make sure we still have
+ * another add in place.
+ */
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 1;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_DROP_DELAYED_REF;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ ref.action = BTRFS_ADD_DELAYED_REF;
+ ref.ref_root = FAKE_ROOT_OBJECTID + 2;
+ ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
+ if (ret) {
+ test_err("failed ref action %d", ret);
+ goto out;
+ }
+
+ head = btrfs_select_ref_head(fs_info, delayed_refs);
+ if (IS_ERR_OR_NULL(head)) {
+ if (IS_ERR(head))
+ test_err("failed to select delayed ref head: %ld",
+ PTR_ERR(head));
+ else
+ test_err("failed to find delayed ref head");
+ ret = -EINVAL;
+ head = NULL;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ if (validate_ref_head(head, &head_check)) {
+ test_err("head check failed");
+ goto out;
+ }
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_ADD_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID + 2;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+
+ spin_lock(&head->lock);
+ node = btrfs_select_delayed_ref(head);
+ spin_unlock(&head->lock);
+ if (!node) {
+ test_err("failed to select delayed ref");
+ goto out;
+ }
+
+ node_check.action = BTRFS_DROP_DELAYED_REF;
+ node_check.root = FAKE_ROOT_OBJECTID;
+ if (validate_ref_node(node, &node_check)) {
+ test_err("node check failed");
+ goto out;
+ }
+ delete_delayed_ref_node(head, node);
+ ret = 0;
+out:
+ if (head)
+ btrfs_unselect_ref_head(delayed_refs, head);
+ btrfs_destroy_delayed_refs(trans->transaction);
+ return ret;
+}
+
+int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_transaction *transaction;
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info;
+ int ret;
+
+ test_msg("running delayed refs tests");
+
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ return -ENOMEM;
+ }
+ transaction = kmalloc(sizeof(*transaction), GFP_KERNEL);
+ if (!transaction) {
+ test_std_err(TEST_ALLOC_TRANSACTION);
+ ret = -ENOMEM;
+ goto out_free_fs_info;
+ }
+ btrfs_init_dummy_trans(&trans, fs_info);
+ btrfs_init_dummy_transaction(transaction, fs_info);
+ trans.transaction = transaction;
+
+ ret = simple_tests(&trans);
+ if (!ret) {
+ test_msg("running delayed refs merg tests on metadata refs");
+ ret = merge_tests(&trans, BTRFS_REF_METADATA);
+ }
+
+ if (!ret) {
+ test_msg("running delayed refs merg tests on data refs");
+ ret = merge_tests(&trans, BTRFS_REF_DATA);
+ }
+
+ if (!ret)
+ ret = select_delayed_refs_test(&trans);
+
+ kfree(transaction);
+out_free_fs_info:
+ btrfs_free_dummy_fs_info(fs_info);
+ return ret;
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 0a2dbfaaf49e..74aca7180a5a 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -525,7 +525,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, 0);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -542,7 +542,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
* Test again for case where the tree block is sectorsize aligned but
* not nodesize aligned.
*/
- eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, sectorsize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -730,7 +730,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, SZ_1M);
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 56e61ac1cc64..609bb6c9c087 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
test_err("error adding chunk map to mapping tree");
+ btrfs_free_chunk_map(map);
goto out_free;
}
diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
index 30f17eb7b6a8..a7bc58a5c1e2 100644
--- a/fs/btrfs/tests/raid-stripe-tree-tests.c
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -14,6 +14,8 @@
#define RST_TEST_NUM_DEVICES (2)
#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
+#define SZ_48K (SZ_32K + SZ_16K)
+
typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
@@ -30,6 +32,613 @@ static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_de
}
/*
+ * Test creating a range of three extents and then punch a hole in the middle,
+ * deleting all of the middle extents and partially deleting the "book ends".
+ */
+static int test_punch_hole_3extents(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 len1 = SZ_1M;
+ u64 logical2 = logical1 + len1;
+ u64 len2 = SZ_1M;
+ u64 logical3 = logical2 + len2;
+ u64 len3 = SZ_1M;
+ u64 hole_start = logical1 + SZ_256K;
+ u64 hole_len = SZ_2M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+
+ /* Prepare for the test, 1st create 3 x 1M extents. */
+ bioc->map_type = map_type;
+ bioc->size = len1;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ bioc->size = len2;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical3;
+ bioc->size = len3;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical3 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ /*
+ * Delete a range starting at logical1 + 256K and 2M in length. Extent
+ * 1 is truncated to 256k length, extent 2 is completely dropped and
+ * extent 3 is moved 256K to the right.
+ */
+ ret = btrfs_delete_raid_extent(trans, hole_start, hole_len);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ hole_start, hole_start + hole_len);
+ goto out;
+ }
+
+ /* Get the first extent and check its size. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len1 != SZ_256K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_256K, len1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Get the second extent and check it's absent. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded should fail",
+ logical2, logical2 + len2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Get the third extent and check its size. */
+ logical3 += SZ_256K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical3, logical3 + len3);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical3) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical3 + SZ_256K, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len3 != SZ_1M - SZ_256K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_1M - SZ_256K, len3);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1, len1);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical3, len3);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+static int test_delete_two_extents(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 len1 = SZ_1M;
+ u64 logical2 = logical1 + len1;
+ u64 len2 = SZ_1M;
+ u64 logical3 = logical2 + len2;
+ u64 len3 = SZ_1M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+
+ /* Prepare for the test, 1st create 3 x 1M extents. */
+ bioc->map_type = map_type;
+ bioc->size = len1;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ bioc->size = len2;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical3;
+ bioc->size = len3;
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical3 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ /*
+ * Delete a range starting at logical1 and 2M in length. Extents 1
+ * and 2 are dropped and extent 3 is kept as is.
+ */
+ ret = btrfs_delete_raid_extent(trans, logical1, len1 + len2);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1 + len2);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical1, len1);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical2, len2);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical3, &len3, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical3, len3);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical3) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical3, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len3 != SZ_1M) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_1M, len3);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical3, len3);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/* Test punching a hole into a single RAID stripe-extent. */
+static int test_punch_hole(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 hole_start = logical1 + SZ_32K;
+ u64 hole_len = SZ_64K;
+ u64 logical2 = hole_start + hole_len;
+ u64 len = SZ_1M;
+ u64 len1 = SZ_32K;
+ u64 len2 = len - len1 - hole_len;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0,
+ &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical1,
+ logical1 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_1M) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_1M, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, hole_start, hole_len);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ hole_start, hole_start + hole_len);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len1, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical1, logical1 + len1);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len1 != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2, &len2, map_type,
+ 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical2,
+ logical2 + len2);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical2) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical2, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len2 != len - len1 - hole_len) {
+ test_err("invalid length, expected %llu, got %llu",
+ len - len1 - hole_len, len2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Check for the absence of the hole. */
+ ret = btrfs_get_raid_extent_offset(fs_info, hole_start, &hole_len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ ret = -EINVAL;
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ hole_start, hole_start + SZ_64K);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1, len1);
+ if (ret)
+ goto out;
+
+ ret = btrfs_delete_raid_extent(trans, logical2, len2);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 1M RST write that spans two adjacent RST items on disk and then
+ * delete a portion starting in the first item and spanning into the second
+ * item. This is similar to test_front_delete(), but spanning multiple items.
+ */
+static int test_front_delete_prev_item(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical1 = SZ_1M;
+ u64 logical2 = SZ_2M;
+ u64 len = SZ_1M;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical1, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ /* Insert RAID extent 1. */
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical1 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ bioc->logical = logical2;
+ /* Insert RAID extent 2, directly adjacent to it. */
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical2 + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical1 + SZ_512K, SZ_1M);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical1 + SZ_512K, (u64)SZ_1M);
+ goto out;
+ }
+
+ /* Verify item 1 is truncated to 512K. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1, &len, map_type, 0,
+ &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical1,
+ logical1 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical1) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical1, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_512K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_512K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Verify item 2's start is moved by 512K. */
+ ret = btrfs_get_raid_extent_offset(fs_info, logical2 + SZ_512K, &len,
+ map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical2 + SZ_512K, logical2 + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical2 + SZ_512K) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical2 + SZ_512K, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_512K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_512K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Verify there's a hole at [1M+512K, 2M+512K] . */
+ len = SZ_1M;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical1 + SZ_512K, &len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID [%llu, %llu] succeeded, should fail",
+ logical1 + SZ_512K, logical1 + SZ_512K + len);
+ goto out;
+ }
+
+ /* Clean up after us. */
+ ret = btrfs_delete_raid_extent(trans, logical1, SZ_512K);
+ if (ret)
+ goto out;
+
+ ret = btrfs_delete_raid_extent(trans, logical2 + SZ_512K, SZ_512K);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
* Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
* delete the 1st 32K, making the new start address 1M+32K.
*/
@@ -94,45 +703,45 @@ static int test_front_delete(struct btrfs_trans_handle *trans)
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical, SZ_16K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed", logical,
- logical + SZ_32K);
+ logical + SZ_16K);
goto out;
}
- len = SZ_32K;
- ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+ len -= SZ_16K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_16K, &len,
map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed",
- logical + SZ_32K, logical + SZ_32K + len);
+ logical + SZ_16K, logical + SZ_64K);
goto out;
}
- if (io_stripe.physical != logical + SZ_32K) {
+ if (io_stripe.physical != logical + SZ_16K) {
test_err("invalid physical address, expected %llu, got %llu",
- logical + SZ_32K, io_stripe.physical);
+ logical + SZ_16K, io_stripe.physical);
ret = -EINVAL;
goto out;
}
- if (len != SZ_32K) {
+ if (len != SZ_48K) {
test_err("invalid stripe length, expected %llu, got %llu",
- (u64)SZ_32K, len);
+ (u64)SZ_48K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
- if (!ret) {
+ if (ret != -ENODATA) {
ret = -EINVAL;
test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
- logical, logical + SZ_32K);
+ logical, logical + SZ_16K);
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_16K, SZ_48K);
out:
btrfs_put_bioc(bioc);
return ret;
@@ -209,14 +818,14 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)
goto out;
}
- ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_48K, SZ_16K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed",
- logical + SZ_32K, logical + SZ_64K);
+ logical + SZ_48K, logical + SZ_64K);
goto out;
}
- len = SZ_32K;
+ len = SZ_48K;
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
@@ -231,9 +840,19 @@ static int test_tail_delete(struct btrfs_trans_handle *trans)
goto out;
}
- if (len != SZ_32K) {
+ if (len != SZ_48K) {
test_err("invalid stripe length, expected %llu, got %llu",
- (u64)SZ_32K, len);
+ (u64)SZ_48K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ len = SZ_16K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_48K, &len,
+ map_type, 0, &io_stripe);
+ if (ret != -ENODATA) {
+ test_err("lookup of RAID extent [%llu, %llu] succeeded should fail",
+ logical + SZ_48K, logical + SZ_64K);
ret = -EINVAL;
goto out;
}
@@ -456,6 +1075,10 @@ static const test_func_t tests[] = {
test_create_update_delete,
test_tail_delete,
test_front_delete,
+ test_front_delete_prev_item,
+ test_punch_hole,
+ test_punch_hole_3extents,
+ test_delete_two_extents,
};
static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
@@ -478,8 +1101,8 @@ static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
ret = PTR_ERR(root);
goto out;
}
- btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
- BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
+ btrfs_set_super_incompat_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc0b837efd5d..f26a394a9ec5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the deleted_bgs list during a transaction abort.
+ */
+ spin_lock(&transaction->fs_info->unused_bgs_lock);
list_del_init(&cache->bg_list);
+ spin_unlock(&transaction->fs_info->unused_bgs_lock);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
@@ -274,8 +280,10 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
+ const int abort_error = cur_trans->aborted;
+
spin_unlock(&fs_info->trans_lock);
- return cur_trans->aborted;
+ return abort_error;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
@@ -795,8 +803,7 @@ alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
if (delayed_refs_bytes)
- btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
- delayed_refs_bytes);
+ btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -1634,7 +1641,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = &pending->dir->vfs_inode;
+ struct btrfs_inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1660,7 +1667,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* filesystem.
*/
nofs_flags = memalloc_nofs_save();
- pending->error = fscrypt_setup_filename(parent_inode,
+ pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
@@ -1689,8 +1696,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.objectid = objectid;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
@@ -1698,16 +1705,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- parent_root = BTRFS_I(parent_inode)->root;
+ parent_root = parent_inode->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
- cur_time = current_time(parent_inode);
+ cur_time = current_time(&parent_inode->vfs_inode);
/*
* insert the directory item
*/
- ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
+ ret = btrfs_set_inode_index(parent_inode, &index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1715,7 +1722,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
- btrfs_ino(BTRFS_I(parent_inode)),
+ btrfs_ino(parent_inode),
&fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
@@ -1816,7 +1823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_add_root_ref(trans, objectid,
btrfs_root_id(parent_root),
- btrfs_ino(BTRFS_I(parent_inode)), index,
+ btrfs_ino(parent_inode), index,
&fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1854,18 +1861,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
- BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ parent_inode, &key, BTRFS_FT_DIR,
index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
- btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
+ btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
fname.disk_name.len * 2);
- inode_set_mtime_to_ts(parent_inode,
- inode_set_ctime_current(parent_inode));
- ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, parent_inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2095,7 +2102,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the new_bgs list during a transaction abort.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
}
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 184fa5c0062a..9f7c777af635 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int error);
+/*
+ * We want the transaction abort to print stack trace only for errors where the
+ * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
+ * caused by external factors.
+ */
+static inline bool btrfs_abort_should_print_stack(int error)
+{
+ switch (error) {
+ case -EIO:
+ case -EROFS:
+ case -ENOMEM:
+ return false;
+ }
+ return true;
+}
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
@@ -240,7 +254,7 @@ do { \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
__first = true; \
- if (WARN(abort_should_print_stack(error), \
+ if (WARN(btrfs_abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
(error))) { \
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index dfeee033f31f..2b66a6130269 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -764,22 +764,19 @@ static int check_block_group_item(struct extent_buffer *leaf,
return 0;
}
-__printf(4, 5)
+__printf(5, 6)
__cold
-static void chunk_err(const struct extent_buffer *leaf,
+static void chunk_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
const struct btrfs_chunk *chunk, u64 logical,
const char *fmt, ...)
{
- const struct btrfs_fs_info *fs_info = leaf->fs_info;
- bool is_sb;
+ bool is_sb = !leaf;
struct va_format vaf;
va_list args;
int i;
int slot = -1;
- /* Only superblock eb is able to have such small offset */
- is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
-
if (!is_sb) {
/*
* Get the slot number by iterating through all slots, this
@@ -812,13 +809,17 @@ static void chunk_err(const struct extent_buffer *leaf,
/*
* The common chunk check which could also work on super block sys chunk array.
*
+ * If @leaf is NULL, then @chunk must be an on-stack chunk item.
+ * (For superblock sys_chunk array, and fs_info->sectorsize is unreliable)
+ *
* Return -EUCLEAN if anything is corrupted.
* Return 0 if everything is OK.
*/
-int btrfs_check_chunk_valid(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 logical)
+int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ u32 sectorsize)
{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
u64 length;
u64 chunk_end;
u64 stripe_len;
@@ -826,63 +827,73 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
u16 sub_stripes;
u64 type;
u64 features;
+ u32 chunk_sector_size;
bool mixed = false;
int raid_index;
int nparity;
int ncopies;
- length = btrfs_chunk_length(leaf, chunk);
- stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
- sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
- type = btrfs_chunk_type(leaf, chunk);
+ if (leaf) {
+ length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
+ chunk_sector_size = btrfs_chunk_sector_size(leaf, chunk);
+ } else {
+ length = btrfs_stack_chunk_length(chunk);
+ stripe_len = btrfs_stack_chunk_stripe_len(chunk);
+ num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+ sub_stripes = btrfs_stack_chunk_sub_stripes(chunk);
+ type = btrfs_stack_chunk_type(chunk);
+ chunk_sector_size = btrfs_stack_chunk_sector_size(chunk);
+ }
raid_index = btrfs_bg_flags_to_raid_index(type);
ncopies = btrfs_raid_array[raid_index].ncopies;
nparity = btrfs_raid_array[raid_index].nparity;
if (unlikely(!num_stripes)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
if (unlikely(num_stripes < ncopies)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes < ncopies, have %u < %d",
num_stripes, ncopies);
return -EUCLEAN;
}
if (unlikely(nparity && num_stripes == nparity)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes == nparity, have %u == %d",
num_stripes, nparity);
return -EUCLEAN;
}
- if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(!IS_ALIGNED(logical, sectorsize))) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
- logical, fs_info->sectorsize);
+ logical, sectorsize);
return -EUCLEAN;
}
- if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(chunk_sector_size != sectorsize)) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk sectorsize, have %u expect %u",
- btrfs_chunk_sector_size(leaf, chunk),
- fs_info->sectorsize);
+ chunk_sector_size, sectorsize);
return -EUCLEAN;
}
- if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
- chunk_err(leaf, chunk, logical,
+ if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) {
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk length, have %llu", length);
return -EUCLEAN;
}
if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk logical start and length, have logical start %llu length %llu",
logical, length);
return -EUCLEAN;
}
if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk stripe length: %llu",
stripe_len);
return -EUCLEAN;
@@ -896,30 +907,29 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
* Thus it should be a good way to catch obvious bitflips.
*/
if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"chunk length too large: have %llu limit %llu",
length, btrfs_stripe_nr_to_offset(U32_MAX));
return -EUCLEAN;
}
if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- btrfs_chunk_type(leaf, chunk));
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
return -EUCLEAN;
}
if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return -EUCLEAN;
}
if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
type, BTRFS_BLOCK_GROUP_TYPE_MASK);
return -EUCLEAN;
@@ -928,7 +938,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
(type & (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA)))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"system chunk with data or metadata type: 0x%llx",
type);
return -EUCLEAN;
@@ -941,7 +951,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
if (!mixed) {
if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
(type & BTRFS_BLOCK_GROUP_DATA))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"mixed chunk type in non-mixed mode: 0x%llx", type);
return -EUCLEAN;
}
@@ -963,7 +973,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
- chunk_err(leaf, chunk, logical,
+ chunk_err(fs_info, leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -983,14 +993,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
int num_stripes;
if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
- chunk_err(leaf, chunk, key->offset,
+ chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size(leaf, slot),
sizeof(struct btrfs_chunk),
- BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
@@ -1001,14 +1012,15 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
if (unlikely(btrfs_chunk_item_size(num_stripes) !=
btrfs_item_size(leaf, slot))) {
- chunk_err(leaf, chunk, key->offset,
+ chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
btrfs_item_size(leaf, slot),
btrfs_chunk_item_size(num_stripes));
return -EUCLEAN;
}
out:
- return btrfs_check_chunk_valid(leaf, chunk, key->offset);
+ return btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset,
+ fs_info->sectorsize);
}
__printf(3, 4)
@@ -2223,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, check->level, found_level);
- return -EIO;
+ return -EUCLEAN;
}
if (!check->has_first_key)
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index db67f96cbe4b..eb201f4ec3c7 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -10,6 +10,7 @@
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
+struct btrfs_fs_info;
struct btrfs_chunk;
struct btrfs_key;
@@ -66,8 +67,10 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node);
int btrfs_check_leaf(struct extent_buffer *leaf);
int btrfs_check_node(struct extent_buffer *node);
-int btrfs_check_chunk_valid(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *leaf,
+ const struct btrfs_chunk *chunk, u64 logical,
+ u32 sectorsize);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
int btrfs_verify_level_key(struct extent_buffer *eb,
const struct btrfs_tree_parent_check *check);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c8d6587688b3..90dc094cfa5e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* We're holding a transaction handle whether we are logging or
@@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log,
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given eb, slot and key all refer to
+ * the source data we are copying out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The given root is for the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and will be
+ * released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/*
@@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
@@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
+ if (!src_copy) {
btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
return -ENOMEM;
}
read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
-
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
u64 nbytes;
u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
item = btrfs_item_ptr(eb, slot,
struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, nbytes);
@@ -514,11 +511,13 @@ insert:
key, item_size);
path->skip_release_on_error = 0;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
- u32 found_size;
- found_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
@@ -526,8 +525,7 @@ insert:
} else if (ret) {
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -546,7 +544,6 @@ insert:
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0) {
- struct extent_buffer *dst_eb = path->nodes[0];
const u64 ino_size = btrfs_inode_size(eb, src_item);
/*
@@ -564,33 +561,30 @@ insert:
}
if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
if (key->type == BTRFS_INODE_ITEM_KEY) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -614,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
+static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
+ u64 objectid)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
- inode = NULL;
+ return NULL;
return inode;
}
@@ -650,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long size;
int ret = 0;
@@ -689,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
btrfs_release_path(path);
goto out;
}
@@ -724,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
@@ -748,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
(unsigned long)item, sizeof(*item));
ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
offset = key->offset - btrfs_file_extent_offset(eb, item);
/*
@@ -902,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -948,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
@@ -973,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1149,7 +1134,7 @@ again:
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
- struct inode *victim_parent;
+ struct btrfs_inode *victim_parent;
leaf = path->nodes[0];
@@ -1189,10 +1174,10 @@ again:
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
+ victim_parent,
inode, &victim_name);
}
- iput(victim_parent);
+ iput(&victim_parent->vfs_inode);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1326,7 +1311,7 @@ again:
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
btrfs_release_path(path);
dir = read_one_inode(root, parent_id);
@@ -1335,10 +1320,9 @@ again:
kfree(name.name);
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1370,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
@@ -1436,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1448,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
+ ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
@@ -1459,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
if (ret)
goto out;
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
@@ -1474,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
kfree(name.name);
name.name = NULL;
if (log_ref_ver) {
- iput(dir);
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1487,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
if (ret)
goto out;
@@ -1497,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1612,25 +1595,25 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = count_inode_refs(BTRFS_I(inode), path);
+ ret = count_inode_refs(inode, path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(BTRFS_I(inode), path);
+ ret = count_inode_extrefs(inode, path);
if (ret < 0)
goto out;
@@ -1638,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
ino, 1);
if (ret)
@@ -1670,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
@@ -1704,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
}
ret = fixup_inode_link_count(trans, inode);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
@@ -1732,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
inode = read_one_inode(root, objectid);
if (!inode)
return -EIO;
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
@@ -1746,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
} else if (ret == -EEXIST) {
ret = 0;
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1770,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
inode = read_one_inode(root, location->objectid);
@@ -1780,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
dir = read_one_inode(root, dirid);
if (!dir) {
- iput(inode);
+ iput(&inode->vfs_inode);
return -EIO;
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1852,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
@@ -1882,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1899,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1956,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
@@ -2117,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2173,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2185,7 +2167,8 @@ out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -2309,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
- struct inode *dir;
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
@@ -2386,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
- iput(dir);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2480,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
inode = read_one_inode(root, key.objectid);
@@ -2488,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
ret = -EIO;
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
+ ret = btrfs_drop_extents(wc->trans, root, inode,
&drop_args);
if (!ret) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- BTRFS_I(inode));
+ ret = btrfs_update_inode(wc->trans, inode);
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
@@ -3561,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
- key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = first_offset;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
/*
* -EEXIST is fine and can happen sporadically when we are logging a
@@ -3588,7 +3569,6 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
last_offset = max(last_offset, curr_end);
}
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -4566,7 +4546,6 @@ copy_item:
dst_index++;
}
- btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
btrfs_release_path(dst_path);
out:
kfree(ins_data);
@@ -4776,7 +4755,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, &fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(fi));
- btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -5485,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5501,7 +5478,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5528,17 +5505,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5580,14 +5556,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5665,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5756,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5797,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5833,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5851,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5863,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -6355,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6371,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6380,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6793,7 +6767,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
- struct inode *dir_inode;
+ struct btrfs_inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
@@ -6842,18 +6816,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -6878,7 +6850,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6893,11 +6865,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -7065,26 +7036,20 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
- bool log_dentries = false;
+ bool log_dentries;
- if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* If we're logging an inode from a subvolume created in the current
* transaction we must force a commit since the root is not persisted.
*/
- if (btrfs_root_generation(&root->root_item) == trans->transid) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
@@ -7093,14 +7058,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
*/
if ((btrfs_inode_in_log(inode, trans->transid) &&
list_empty(&ctx->ordered_extents)) ||
- inode->vfs_inode.i_nlink == 0) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ inode->vfs_inode.i_nlink == 0)
+ return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_no_trans;
+ return ret;
ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
@@ -7119,8 +7082,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
- log_dentries = true;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
/*
* On unlink we must make sure all our current and old parent directory
@@ -7175,8 +7141,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (log_dentries)
ret = log_new_dir_dentries(trans, inode, ctx);
- else
- ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
@@ -7186,7 +7150,7 @@ end_trans:
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -7251,8 +7215,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index aca2861f2187..17b5e81123a1 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -140,8 +140,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
- btrfs_mark_buffer_dirty(trans, eb);
-
out:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index e97ad824ae16..b7a96a005487 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode)
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
goto out;
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1cccaf9c2b0d..c8c21c55be53 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -13,8 +13,8 @@
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
-#include "ctree.h"
#include "disk-io.h"
+#include "extent-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
@@ -48,6 +48,7 @@ struct btrfs_io_geometry {
u64 raid56_full_stripe_start;
int max_errors;
enum btrfs_map_op op;
+ bool use_rst;
};
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
@@ -797,6 +798,10 @@ static int get_canonical_dev_path(const char *dev_path, char *canonical)
if (ret)
goto out;
resolved_path = d_path(&path, path_buf, PATH_MAX);
+ if (IS_ERR(resolved_path)) {
+ ret = PTR_ERR(resolved_path);
+ goto out;
+ }
ret = strscpy(canonical, resolved_path, PATH_MAX);
out:
kfree(path_buf);
@@ -1298,6 +1303,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct btrfs_device *tmp_device;
+ s64 __maybe_unused value = 0;
int ret = 0;
list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
@@ -1327,7 +1333,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ fs_devices->rr_min_contig_read = BTRFS_DEFAULT_RR_MIN_CONTIG_READ;
+ fs_devices->read_devid = latest_dev->devid;
+ fs_devices->read_policy = btrfs_read_policy_to_enum(btrfs_get_mod_read_policy(),
+ &value);
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->collect_fs_stats = true;
+
+ if (value) {
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_RR)
+ fs_devices->rr_min_contig_read = value;
+ if (fs_devices->read_policy == BTRFS_READ_POLICY_DEVID)
+ fs_devices->read_devid = value;
+ }
+#else
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+#endif
return 0;
}
@@ -1776,8 +1798,8 @@ again:
path->skip_locking = 1;
key.objectid = device->devid;
- key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = search_start;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
@@ -1896,8 +1918,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = device->devid;
- key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -2045,7 +2067,6 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -2700,8 +2721,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = 0;
while (1) {
btrfs_reserve_chunk_metadata(trans, false);
@@ -2741,11 +2762,9 @@ next_slot:
device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
- if (device->fs_devices->seeding) {
+ if (device->fs_devices->seeding)
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(trans, leaf);
- }
path->slots[0]++;
goto next_slot;
@@ -3038,8 +3057,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(trans, leaf);
-
out:
btrfs_free_path(path);
return ret;
@@ -3102,8 +3119,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -3560,8 +3577,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -3748,10 +3765,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
-
btrfs_set_balance_flags(leaf, item, bctl->flags);
-
- btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -4170,8 +4184,8 @@ again:
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
@@ -4987,8 +5001,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
again:
key.objectid = device->devid;
- key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = (u64)-1;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -5513,33 +5527,34 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
btrfs_free_chunk_map(map);
}
+static int btrfs_chunk_map_cmp(const struct rb_node *new,
+ const struct rb_node *exist)
+{
+ const struct btrfs_chunk_map *new_map =
+ rb_entry(new, struct btrfs_chunk_map, rb_node);
+ const struct btrfs_chunk_map *exist_map =
+ rb_entry(exist, struct btrfs_chunk_map, rb_node);
+
+ if (new_map->start == exist_map->start)
+ return 0;
+ if (new_map->start < exist_map->start)
+ return -1;
+ return 1;
+}
+
EXPORT_FOR_TESTS
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- bool leftmost = true;
+ struct rb_node *exist;
write_lock(&fs_info->mapping_tree_lock);
- p = &fs_info->mapping_tree.rb_root.rb_node;
- while (*p) {
- struct btrfs_chunk_map *entry;
-
- parent = *p;
- entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
-
- if (map->start < entry->start) {
- p = &(*p)->rb_left;
- } else if (map->start > entry->start) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- write_unlock(&fs_info->mapping_tree_lock);
- return -EEXIST;
- }
+ exist = rb_find_add_cached(&map->rb_node, &fs_info->mapping_tree,
+ btrfs_chunk_map_cmp);
+
+ if (exist) {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
}
- rb_link_node(&map->rb_node, parent, p);
- rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
write_unlock(&fs_info->mapping_tree_lock);
@@ -5959,6 +5974,76 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+ for (int index = first; index < first + num_stripes; index++) {
+ const struct btrfs_device *device = map->stripes[index].dev;
+
+ if (device->devid == READ_ONCE(device->fs_devices->read_devid))
+ return index;
+ }
+
+ /* If no read-preferred device is set use the first stripe. */
+ return first;
+}
+
+struct stripe_mirror {
+ u64 devid;
+ int num;
+};
+
+static int btrfs_cmp_devid(const void *a, const void *b)
+{
+ const struct stripe_mirror *s1 = (const struct stripe_mirror *)a;
+ const struct stripe_mirror *s2 = (const struct stripe_mirror *)b;
+
+ if (s1->devid < s2->devid)
+ return -1;
+ if (s1->devid > s2->devid)
+ return 1;
+ return 0;
+}
+
+/*
+ * Select a stripe for reading using the round-robin algorithm.
+ *
+ * 1. Compute the read cycle as the total sectors read divided by the minimum
+ * sectors per device.
+ * 2. Determine the stripe number for the current read by taking the modulus
+ * of the read cycle with the total number of stripes:
+ *
+ * stripe index = (total sectors / min sectors per dev) % num stripes
+ *
+ * The calculated stripe index is then used to select the corresponding device
+ * from the list of devices, which is ordered by devid.
+ */
+static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_stripes)
+{
+ struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = { 0 };
+ struct btrfs_device *device = map->stripes[first].dev;
+ struct btrfs_fs_info *fs_info = device->fs_devices->fs_info;
+ unsigned int read_cycle;
+ unsigned int total_reads;
+ unsigned int min_reads_per_dev;
+
+ total_reads = percpu_counter_sum(&fs_info->stats_read_blocks);
+ min_reads_per_dev = READ_ONCE(fs_info->fs_devices->rr_min_contig_read) >>
+ fs_info->sectorsize_bits;
+
+ for (int index = 0, i = first; i < first + num_stripes; i++) {
+ stripes[index].devid = map->stripes[i].dev->devid;
+ stripes[index].num = i;
+ index++;
+ }
+ sort(stripes, num_stripes, sizeof(struct stripe_mirror),
+ btrfs_cmp_devid, NULL);
+
+ read_cycle = total_reads / min_reads_per_dev;
+ return stripes[read_cycle % num_stripes].num;
+}
+#endif
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@@ -5988,6 +6073,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ case BTRFS_READ_POLICY_RR:
+ preferred_mirror = btrfs_read_rr(map, first, num_stripes);
+ break;
+ case BTRFS_READ_POLICY_DEVID:
+ preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
+ break;
+#endif
}
if (dev_replace_is_ongoing &&
@@ -6346,8 +6439,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
{
dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (io_geom->op == BTRFS_MAP_READ &&
- btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
return btrfs_get_raid_extent_offset(fs_info, logical, length,
map->type,
io_geom->stripe_index, dst);
@@ -6362,7 +6454,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
const struct btrfs_io_stripe *smap,
const struct btrfs_chunk_map *map,
int num_alloc_stripes,
- enum btrfs_map_op op, int mirror_num)
+ struct btrfs_io_geometry *io_geom)
{
if (!smap)
return false;
@@ -6370,10 +6462,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
if (num_alloc_stripes != 1)
return false;
- if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
return false;
- if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
return false;
return true;
@@ -6579,6 +6671,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom.raid56_full_stripe_start = (u64)-1;
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
+ io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
if (dev_replace->replace_task != current)
down_read(&dev_replace->rwsem);
@@ -6647,8 +6740,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
- io_geom.mirror_num)) {
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
*mirror_num_ret = io_geom.mirror_num;
@@ -6662,6 +6754,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
bioc->map_type = map->type;
+ bioc->use_rst = io_geom.use_rst;
/*
* For RAID56 full map, we need to make sure the stripes[] follows the
@@ -7002,16 +7095,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
- /*
- * Only need to verify chunk item if we're reading from sys chunk array,
- * as chunk item in tree block is already verified by tree-checker.
- */
- if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
- ret = btrfs_check_chunk_valid(leaf, chunk, logical);
- if (ret)
- return ret;
- }
-
map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
@@ -7072,6 +7155,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
map->start, map->chunk_len, ret);
+ btrfs_free_chunk_map(map);
}
return ret;
@@ -7117,8 +7201,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
- if (!btrfs_test_opt(fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "failed to find fsid %pU when attempting to open seed devices",
+ fsid);
return ERR_PTR(-ENOENT);
+ }
fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
@@ -7269,16 +7357,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
- struct btrfs_disk_key *disk_key;
- struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
- u32 num_stripes;
u32 array_size;
- u32 len = 0;
u32 cur_offset;
- u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
@@ -7301,10 +7384,15 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
cur_offset = 0;
while (cur_offset < array_size) {
- disk_key = (struct btrfs_disk_key *)array_ptr;
- len = sizeof(*disk_key);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ struct btrfs_chunk *chunk;
+ struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)array_ptr;
+ u32 len = sizeof(*disk_key);
+
+ /*
+ * The sys_chunk_array has been already verified at super block
+ * read time. Only do ASSERT()s for basic checks.
+ */
+ ASSERT(cur_offset + len <= array_size);
btrfs_disk_key_to_cpu(&key, disk_key);
@@ -7312,44 +7400,14 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
- if (key.type != BTRFS_CHUNK_ITEM_KEY) {
- btrfs_err(fs_info,
- "unexpected item type %u in sys_array at offset %u",
- (u32)key.type, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);
chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be present,
- * exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
+ ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
+ len = btrfs_chunk_item_size(btrfs_chunk_num_stripes(sb, chunk));
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ ASSERT(cur_offset + len <= array_size);
ret = read_one_chunk(&key, sb, chunk);
if (ret)
@@ -7362,13 +7420,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
-
-out_short_read:
- btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
- len, cur_offset);
- clear_extent_buffer_uptodate(sb);
- free_extent_buffer_stale(sb);
- return -EIO;
}
/*
@@ -7488,8 +7539,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *node = path->nodes[1];
@@ -7568,8 +7619,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
int ret = 0;
- fs_devices->fs_info = fs_info;
-
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info;
@@ -7745,8 +7794,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(trans, eb);
-
out:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3a416b1bc24c..e247d551da67 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -7,6 +7,7 @@
#define BTRFS_VOLUMES_H
#include <linux/blk_types.h>
+#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/atomic.h>
#include <linux/sort.h>
@@ -18,14 +19,17 @@
#include <linux/completion.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
#include "rcu-string.h"
+#include "extent-io-tree.h"
struct block_device;
struct bdev_handle;
struct btrfs_fs_info;
struct btrfs_block_group;
struct btrfs_trans_handle;
+struct btrfs_transaction;
struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -296,6 +300,9 @@ enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_ZONED,
};
+#define BTRFS_DEFAULT_RR_MIN_CONTIG_READ (SZ_256K)
+/* Keep in sync with raid_attr table, current maximum is RAID1C4. */
+#define BTRFS_RAID1_MAX_MIRRORS (4)
/*
* Read policies for mirrored block group profiles, read picks the stripe based
* on these policies.
@@ -303,6 +310,12 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /* Balancing RAID1 reads across all striped devices (round-robin). */
+ BTRFS_READ_POLICY_RR,
+ /* Read from a specific device. */
+ BTRFS_READ_POLICY_DEVID,
+#endif
BTRFS_NR_READ_POLICY,
};
@@ -417,6 +430,8 @@ struct btrfs_fs_devices {
bool seeding;
/* The mount needs to use a randomly generated fsid. */
bool temp_fsid;
+ /* Enable/disable the filesystem stats tracking. */
+ bool collect_fs_stats;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -431,6 +446,15 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ /*
+ * Minimum contiguous reads before switching to next device, the unit
+ * is one block/sectorsize.
+ */
+ u32 rr_min_contig_read;
+
+ /* Device to be used for reading in case of RAID1. */
+ u64 read_devid;
+
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@@ -485,6 +509,7 @@ struct btrfs_io_context {
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
+ bool use_rst;
u64 logical;
u64 size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index bc18710d1dcf..3e0edbcf73e1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -204,7 +204,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
- btrfs_mark_buffer_dirty(trans, leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 8dc4cf49f6f0..0ce10e4ec836 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
+#include <linux/types.h>
+
struct dentry;
struct inode;
struct qstr;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ddf0d5a448a7..545f413d81fc 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -94,6 +94,47 @@ fail:
return ERR_PTR(-ENOMEM);
}
+/*
+ * Helper for S390x with hardware zlib compression support.
+ *
+ * That hardware acceleration requires a buffer size larger than a single page
+ * to get ideal performance, thus we need to do the memory copy rather than
+ * use the page cache directly as input buffer.
+ */
+static int copy_data_into_buffer(struct address_space *mapping,
+ struct workspace *workspace, u64 filepos,
+ unsigned long length)
+{
+ u64 cur = filepos;
+
+ /* It's only for hardware accelerated zlib code. */
+ ASSERT(zlib_deflate_dfltcc_enabled());
+
+ while (cur < filepos + length) {
+ struct folio *folio;
+ void *data_in;
+ unsigned int offset;
+ unsigned long copy_length;
+ int ret;
+
+ ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
+ if (ret < 0)
+ return ret;
+ /* No large folio support yet. */
+ ASSERT(!folio_test_large(folio));
+
+ offset = offset_in_folio(folio, cur);
+ copy_length = min(folio_size(folio) - offset,
+ filepos + length - cur);
+
+ data_in = kmap_local_folio(folio, offset);
+ memcpy(workspace->buf + cur - filepos, data_in, copy_length);
+ kunmap_local(data_in);
+ cur += copy_length;
+ }
+ return 0;
+}
+
int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
@@ -105,8 +146,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
int nr_folios = 0;
struct folio *in_folio = NULL;
struct folio *out_folio = NULL;
- unsigned long bytes_left;
- unsigned int in_buf_folios;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
@@ -150,34 +189,21 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
* the workspace buffer if required.
*/
if (workspace->strm.avail_in == 0) {
- bytes_left = len - workspace->strm.total_in;
- in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_folios > 1) {
- int i;
-
- /* S390 hardware acceleration path, not subpage. */
- ASSERT(!btrfs_is_subpage(
- inode_to_fs_info(mapping->host),
- mapping));
- for (i = 0; i < in_buf_folios; i++) {
- if (data_in) {
- kunmap_local(data_in);
- folio_put(in_folio);
- data_in = NULL;
- }
- ret = btrfs_compress_filemap_get_folio(mapping,
- start, &in_folio);
- if (ret < 0)
- goto out;
- data_in = kmap_local_folio(in_folio, 0);
- copy_page(workspace->buf + i * PAGE_SIZE,
- data_in);
- start += PAGE_SIZE;
- workspace->strm.avail_in =
- (in_buf_folios << PAGE_SHIFT);
- }
+ unsigned long bytes_left = len - workspace->strm.total_in;
+ unsigned int copy_length = min(bytes_left, workspace->buf_size);
+
+ /*
+ * This can only happen when hardware zlib compression is
+ * enabled.
+ */
+ if (copy_length > PAGE_SIZE) {
+ ret = copy_data_into_buffer(mapping, workspace,
+ start, copy_length);
+ if (ret < 0)
+ goto out;
+ start += copy_length;
workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = copy_length;
} else {
unsigned int pg_off;
unsigned int cur_len;
@@ -463,6 +489,7 @@ out:
const struct btrfs_compress_op btrfs_zlib_compress = {
.workspace_manager = &wsm,
+ .min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11ed523e528e..4a3e02b49f29 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -748,8 +748,9 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
- if (fs_info->max_zone_append_size < fs_info->max_extent_size)
- fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
+ fs_info->max_zone_append_size);
/*
* Check mount options here, because we might change fs_info->zoned
@@ -1276,7 +1277,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1306,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1318,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1587,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1658,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
- ret = 0;
}
out:
@@ -2110,6 +2124,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2271,6 +2288,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_zoned_device_info *zinfo = device->zone_info;
unsigned int nofs_flags;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2324,6 +2344,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!btrfs_is_zoned(fs_info))
return true;
+ if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
+ return false;
+
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&fs_info->zone_active_bgs_lock);
@@ -2651,3 +2674,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
+
+/*
+ * Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
+ *
+ * @space_info: the space to work on
+ * @num_bytes: targeting reclaim bytes
+ *
+ * This one resets the zones of a block group, so we can reuse the region
+ * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
+ * just removes a block group and frees up the underlying zones. So, we still
+ * need to allocate a new block group to reuse the zones.
+ *
+ * Resetting is faster than deleting/recreating a block group. It is similar
+ * to freeing the logical space on the regular mode. However, we cannot change
+ * the block group's profile with this operation.
+ */
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ while (num_bytes > 0) {
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg = NULL;
+ bool found = false;
+ u64 reclaimed = 0;
+
+ /*
+ * Here, we choose a fully zone_unusable block group. It's
+ * technically possible to reset a partly zone_unusable block
+ * group, which still has some free space left. However,
+ * handling that needs to cope with the allocation side, which
+ * makes the logic more complex. So, let's handle the easy case
+ * for now.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
+ if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
+ continue;
+
+ /*
+ * Use trylock to avoid locking order violation. In
+ * btrfs_reclaim_bgs_work(), the lock order is
+ * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
+ * block group if we cannot take its lock.
+ */
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+ found = true;
+ break;
+ }
+ if (!found) {
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return 0;
+ }
+
+ list_del_init(&bg->bg_list);
+ btrfs_put_block_group(bg);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /*
+ * Since the block group is fully zone_unusable and we cannot
+ * allocate from this block group anymore, we don't need to set
+ * this block group read-only.
+ */
+
+ down_read(&fs_info->dev_replace.rwsem);
+ map = bg->physical_map;
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ unsigned int nofs_flags;
+ int ret;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
+ stripe->physical >> SECTOR_SHIFT,
+ zone_size_sectors);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret) {
+ up_read(&fs_info->dev_replace.rwsem);
+ return ret;
+ }
+ }
+ up_read(&fs_info->dev_replace.rwsem);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ ASSERT(!btrfs_is_block_group_used(bg));
+ if (bg->ro) {
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ continue;
+ }
+
+ reclaimed = bg->alloc_offset;
+ bg->zone_unusable = bg->length - bg->zone_capacity;
+ bg->alloc_offset = 0;
+ /*
+ * This holds because we currently reset fully used then freed
+ * block group.
+ */
+ ASSERT(reclaimed == bg->zone_capacity);
+ bg->free_space_ctl->free_space += reclaimed;
+ space_info->bytes_zone_unusable -= reclaimed;
+ spin_unlock(&bg->lock);
+ btrfs_return_free_space(space_info, reclaimed);
+ spin_unlock(&space_info->lock);
+
+ if (num_bytes <= reclaimed)
+ break;
+ num_bytes -= reclaimed;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 7612e6572605..9672bf4c3335 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
+int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
+static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ return 0;
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5232b56d5892..3541efa765c7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -26,11 +26,12 @@
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MIN_LEVEL -15
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(int level,
size_t src_len)
{
zstd_parameters params = zstd_get_params(level, src_len);
@@ -45,13 +46,14 @@ struct workspace {
void *mem;
size_t size;
char *buf;
- unsigned int level;
- unsigned int req_level;
+ int level;
+ int req_level;
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
zstd_in_buffer in_buf;
zstd_out_buffer out_buf;
+ zstd_parameters params;
};
/*
@@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+static inline int clip_level(int level)
+{
+ return max(0, level - 1);
+}
/*
* Timer callback to free unused workspaces.
@@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_for_each_prev_safe(pos, next, &wsm.lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
- unsigned int level;
+ int level;
if (time_after(victim->last_used, reclaim_threshold))
break;
@@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level - 1]))
- clear_bit(level - 1, &wsm.active_map);
+ if (list_empty(&wsm.idle_ws[level]))
+ clear_bit(level, &wsm.active_map);
}
@@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
static void zstd_calc_ws_mem_sizes(void)
{
size_t max_size = 0;
- unsigned int level;
+ int level;
- for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ if (level == 0)
+ continue;
zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
@@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void)
zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
- zstd_ws_mem_sizes[level - 1] = max_size;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ zstd_ws_mem_sizes[clip_level(level)] = max_size;
}
}
@@ -218,7 +225,7 @@ void zstd_cleanup_workspace_manager(void)
}
spin_unlock_bh(&wsm.lock);
- del_timer_sync(&wsm.timer);
+ timer_delete_sync(&wsm.timer);
}
/*
@@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(unsigned int level)
+static struct list_head *zstd_find_workspace(int level)
{
struct list_head *ws;
struct workspace *workspace;
- int i = level - 1;
+ int i = clip_level(level);
spin_lock_bh(&wsm.lock);
for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
@@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
- if (level == workspace->level)
+ if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
if (list_empty(&wsm.idle_ws[i]))
clear_bit(i, &wsm.active_map);
@@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws)
spin_lock_bh(&wsm.lock);
/* A node is only taken off the lru if we are the corresponding level */
- if (workspace->req_level == workspace->level) {
+ if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
@@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws)
}
}
- set_bit(workspace->level - 1, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ set_bit(workspace->level, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
workspace->req_level = 0;
spin_unlock_bh(&wsm.lock);
- if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
cond_wake_up(&wsm.wait);
}
@@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(int level)
{
struct workspace *workspace;
@@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level)
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = zstd_ws_mem_sizes[level - 1];
- workspace->level = level;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ workspace->size = zstd_ws_mem_sizes[clip_level(level)];
+ workspace->level = clip_level(level);
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
@@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- unsigned int pg_off;
unsigned int cur_len;
- zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
- len);
+ workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
*out_folios = 0;
*total_out = 0;
*total_in = 0;
/* Initialize the stream */
- stream = zstd_init_cstream(&params, len, workspace->mem,
+ stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
struct btrfs_inode *inode = BTRFS_I(mapping->host);
@@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
@@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ workspace->in_buf.src = kmap_local_folio(in_folio,
+ offset_in_page(start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
}
@@ -717,6 +722,7 @@ finish:
const struct btrfs_compress_op btrfs_zstd_compress = {
/* ZSTD uses own workspace manager */
.workspace_manager = NULL,
+ .min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};