summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode_dotl.c2
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/bcachefs/Kconfig1
-rw-r--r--fs/bcachefs/acl.c4
-rw-r--r--fs/bcachefs/alloc_background.c95
-rw-r--r--fs/bcachefs/alloc_background.h6
-rw-r--r--fs/bcachefs/alloc_foreground.c79
-rw-r--r--fs/bcachefs/backpointers.c24
-rw-r--r--fs/bcachefs/bcachefs.h7
-rw-r--r--fs/bcachefs/btree_gc.c4
-rw-r--r--fs/bcachefs/btree_io.c17
-rw-r--r--fs/bcachefs/btree_iter.c188
-rw-r--r--fs/bcachefs/btree_iter.h122
-rw-r--r--fs/bcachefs/btree_key_cache.c32
-rw-r--r--fs/bcachefs/btree_node_scan.c8
-rw-r--r--fs/bcachefs/btree_types.h1
-rw-r--r--fs/bcachefs/btree_update.c26
-rw-r--r--fs/bcachefs/btree_update_interior.c12
-rw-r--r--fs/bcachefs/btree_write_buffer.c10
-rw-r--r--fs/bcachefs/buckets.c16
-rw-r--r--fs/bcachefs/buckets.h21
-rw-r--r--fs/bcachefs/buckets_types.h5
-rw-r--r--fs/bcachefs/chardev.c14
-rw-r--r--fs/bcachefs/clock.c2
-rw-r--r--fs/bcachefs/compress.c5
-rw-r--r--fs/bcachefs/data_update.c8
-rw-r--r--fs/bcachefs/debug.c4
-rw-r--r--fs/bcachefs/dirent.c16
-rw-r--r--fs/bcachefs/disk_accounting.c4
-rw-r--r--fs/bcachefs/disk_groups.c4
-rw-r--r--fs/bcachefs/ec.c18
-rw-r--r--fs/bcachefs/error.c7
-rw-r--r--fs/bcachefs/extent_update.c6
-rw-r--r--fs/bcachefs/fs-io-buffered.c6
-rw-r--r--fs/bcachefs/fs-io.c14
-rw-r--r--fs/bcachefs/fs.c24
-rw-r--r--fs/bcachefs/fsck.c30
-rw-r--r--fs/bcachefs/inode.c18
-rw-r--r--fs/bcachefs/io_misc.c18
-rw-r--r--fs/bcachefs/io_read.c14
-rw-r--r--fs/bcachefs/io_write.c40
-rw-r--r--fs/bcachefs/journal.c14
-rw-r--r--fs/bcachefs/journal_io.c8
-rw-r--r--fs/bcachefs/migrate.c4
-rw-r--r--fs/bcachefs/move.c14
-rw-r--r--fs/bcachefs/movinggc.c8
-rw-r--r--fs/bcachefs/namei.c38
-rw-r--r--fs/bcachefs/quota.c2
-rw-r--r--fs/bcachefs/rebalance.c12
-rw-r--r--fs/bcachefs/recovery.c6
-rw-r--r--fs/bcachefs/reflink.c23
-rw-r--r--fs/bcachefs/sb-members.h23
-rw-r--r--fs/bcachefs/snapshot.c13
-rw-r--r--fs/bcachefs/str_hash.c2
-rw-r--r--fs/bcachefs/str_hash.h8
-rw-r--r--fs/bcachefs/subvolume.c4
-rw-r--r--fs/bcachefs/subvolume.h14
-rw-r--r--fs/bcachefs/super-io.c21
-rw-r--r--fs/bcachefs/super.c85
-rw-r--r--fs/bcachefs/tests.c30
-rw-r--r--fs/bcachefs/xattr.c2
-rw-r--r--fs/btrfs/zstd.c2
-rw-r--r--fs/cachefiles/namei.c7
-rw-r--r--fs/exec.c18
-rw-r--r--fs/exportfs/expfs.c1
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/fuse/dev.c162
-rw-r--r--fs/fuse/dev_uring.c43
-rw-r--r--fs/fuse/dev_uring_i.h18
-rw-r--r--fs/fuse/dir.c11
-rw-r--r--fs/fuse/fuse_dev_i.h4
-rw-r--r--fs/fuse/fuse_i.h47
-rw-r--r--fs/fuse/inode.c51
-rw-r--r--fs/fuse/sysctl.c24
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c7
-rw-r--r--fs/hostfs/hostfs_user.c59
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/namespace.c7
-rw-r--r--fs/nfs/client.c5
-rw-r--r--fs/nfs/delegation.c66
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c24
-rw-r--r--fs/nfs/fs_context.c71
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/nfs3client.c2
-rw-r--r--fs/nfs/nfs3proc.c2
-rw-r--r--fs/nfs/nfs42proc.c172
-rw-r--r--fs/nfs/nfs42xdr.c86
-rw-r--r--fs/nfs/nfs4client.c7
-rw-r--r--fs/nfs/nfs4proc.c17
-rw-r--r--fs/nfs/nfs4state.c14
-rw-r--r--fs/nfs/nfs4trace.h11
-rw-r--r--fs/nfs/nfs4xdr.c19
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/sysfs.c82
-rw-r--r--fs/nfs/write.c4
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/ntfs3/attrib.c3
-rw-r--r--fs/ntfs3/file.c42
-rw-r--r--fs/ntfs3/frecord.c63
-rw-r--r--fs/ntfs3/fsntfs.c28
-rw-r--r--fs/ntfs3/index.c4
-rw-r--r--fs/ntfs3/inode.c40
-rw-r--r--fs/ntfs3/ntfs.h2
-rw-r--r--fs/ntfs3/ntfs_fs.h6
-rw-r--r--fs/ntfs3/super.c89
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/pstore/platform.c2
-rw-r--r--fs/smb/client/cifs_fs_sb.h1
-rw-r--r--fs/smb/client/cifsfs.h5
-rw-r--r--fs/smb/client/cifsglob.h7
-rw-r--r--fs/smb/client/cifssmb.c32
-rw-r--r--fs/smb/client/connect.c180
-rw-r--r--fs/smb/client/fs_context.c21
-rw-r--r--fs/smb/client/fs_context.h5
-rw-r--r--fs/smb/client/inode.c19
-rw-r--r--fs/smb/client/link.c8
-rw-r--r--fs/smb/client/misc.c2
-rw-r--r--fs/smb/client/smb1ops.c57
-rw-r--r--fs/smb/client/smb2file.c21
-rw-r--r--fs/smb/client/smb2glob.h1
-rw-r--r--fs/smb/client/smb2inode.c67
-rw-r--r--fs/smb/client/smb2ops.c32
-rw-r--r--fs/smb/client/smb2pdu.c24
-rw-r--r--fs/smb/client/transport.c3
-rw-r--r--fs/smb/client/xattr.c36
-rw-r--r--fs/smb/common/smb2pdu.h3
-rw-r--r--fs/smb/server/auth.c4
-rw-r--r--fs/smb/server/connection.h11
-rw-r--r--fs/smb/server/mgmt/user_session.c18
-rw-r--r--fs/smb/server/smb2pdu.c21
-rw-r--r--fs/smb/server/smbacl.c21
-rw-r--r--fs/userfaultfd.c51
137 files changed, 2257 insertions, 1009 deletions
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index cc2007be2173..5b5fda617b80 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -407,8 +407,8 @@ static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
err);
goto error;
}
- v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
err = 0;
inc_nlink(dir);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 07a8bfbdd9b9..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -534,6 +534,6 @@ dont_wait:
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
- if (del_timer_sync(&net->fs_probe_timer))
+ if (timer_delete_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c530d1ca15df..8755f2703815 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -318,7 +318,7 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
a = atomic_inc_return(&server->active);
if (a == 1 && activate &&
!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
- del_timer(&server->timer);
+ timer_delete(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index c9798750202d..bf1c94e51dd0 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -26,6 +26,7 @@ config BCACHEFS_FS
select SRCU
select SYMBOLIC_ERRNAME
select MIN_HEAP
+ select XARRAY_MULTI
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 99487727ae64..d03adc36100e 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct posix_acl *acl = NULL;
if (rcu)
@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
umode_t mode;
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c12ca7538e4f..94ea9e49aec4 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -610,7 +610,7 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -631,17 +631,17 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
continue;
}
if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -1039,9 +1039,10 @@ invalid_bucket:
* This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, struct bkey *hole)
{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -1052,9 +1053,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
struct btree_iter iter2;
struct bpos next;
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
- struct btree_path *path = btree_iter_path(iter->trans, iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
@@ -1064,9 +1065,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
- k = bch2_btree_iter_peek_max(&iter2, end);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
next = iter2.pos;
- bch2_trans_iter_exit(iter->trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter2);
BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
@@ -1107,13 +1108,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
- struct bch_fs *c = iter->trans->c;
+ struct bch_fs *c = trans->c;
struct bkey_s_c k;
again:
- k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
if (bkey_err(k))
return k;
@@ -1126,7 +1128,7 @@ again:
if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, hole_start);
+ bch2_btree_iter_set_pos(trans, iter, hole_start);
goto again;
}
@@ -1167,8 +1169,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(discard_iter);
+ bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(trans, discard_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1181,8 +1183,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1195,8 +1197,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1249,9 +1251,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (!ca->mi.freespace_initialized)
return 0;
- bch2_btree_iter_set_pos(freespace_iter, start);
+ bch2_btree_iter_set_pos(trans, freespace_iter, start);
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1300,9 +1302,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1435,7 +1437,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
*gen = a->gen;
out:
fsck_err:
- bch2_set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(trans, &alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
@@ -1572,7 +1574,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
+ k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1610,7 +1612,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, next);
+ bch2_btree_iter_set_pos(trans, &iter, next);
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -1638,7 +1640,7 @@ bkey_err:
BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -1657,7 +1659,7 @@ bkey_err:
break;
}
- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -1685,7 +1687,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret;
- alloc_k = bch2_btree_iter_peek(alloc_iter);
+ alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
if (!alloc_k.k)
return 0;
@@ -1826,7 +1828,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
@@ -1950,7 +1952,7 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
@@ -1967,7 +1969,7 @@ void bch2_dev_do_discards(struct bch_dev *ca)
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_write_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
@@ -2045,7 +2047,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2065,7 +2067,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2082,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans,
if (ret)
return ret;
+ if (!extent_k.k)
+ return 0;
+
struct bkey_i *n =
bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
BTREE_UPDATE_internal_snapshot_node);
@@ -2199,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
struct bkey_s_c k;
again:
- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
*wrapped = true;
goto again;
}
@@ -2251,12 +2256,12 @@ restart_err:
if (ret)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_bkey_buf_exit(&last_flushed, c);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2274,7 +2279,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca)
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2321,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
break;
}
- k = bch2_get_key_or_hole(&iter, end, &hole);
+ k = bch2_get_key_or_hole(trans, &iter, end, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -2340,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
} else {
struct bkey_i *freespace;
@@ -2360,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, k.k->p);
+ bch2_btree_iter_set_pos(trans, &iter, k.k->p);
}
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2506,7 +2511,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- for_each_rw_member(c, ca) {
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
u64 dev_reserve = 0;
/*
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c556ccaffe89..34b3d6ac4fbb 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
{
u64 want_free = ca->mi.nbuckets >> 7;
u64 free = max_t(s64, 0,
- u.d[BCH_DATA_free].buckets
- + u.d[BCH_DATA_need_discard].buckets
+ u.buckets[BCH_DATA_free]
+ + u.buckets[BCH_DATA_need_discard]
- bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+ return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
}
void bch2_dev_do_invalidates(struct bch_dev *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index da0d72928b5b..7c930ef77380 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -327,7 +327,7 @@ again:
bucket = sector_to_bucket(ca,
round_up(bucket_to_sector(ca, bucket) + 1,
1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
s->buckets_seen++;
s->skipped_mi_btree_bitmap++;
continue;
@@ -355,7 +355,7 @@ again:
watermark, s, cl)
: NULL;
next:
- bch2_set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(trans, &citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -417,7 +417,7 @@ again:
1ULL << ca->mi.btree_bitmap_shift));
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
s->skipped_mi_btree_bitmap++;
goto next;
}
@@ -426,7 +426,7 @@ again:
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
break;
}
@@ -469,7 +469,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]);
prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
bch2_copygc_wait_amount(c),
@@ -524,10 +524,10 @@ again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
- if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ if (usage->buckets[BCH_DATA_need_discard] > avail)
bch2_dev_do_discards(ca);
- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ if (usage->buckets[BCH_DATA_need_gc_gens] > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
@@ -606,8 +606,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
- (stripe->next_alloc[l] < stripe->next_alloc[r]));
+ return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
}
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
@@ -626,25 +625,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
return ret;
}
+static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
+static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
+static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
+
+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
+{
+ /*
+ * Avoid underflowing clock hands if at all possible, if clock hands go
+ * to 0 then we lose information - clock hands can be in a wide range if
+ * we have devices we rarely try to allocate from, if we generally
+ * allocate from a specified target but only sometimes have to fall back
+ * to the whole filesystem.
+ */
+ u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
+ u64 scale_min = 0; /* minumum we must subtract to avoid overflow */
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
+ if (*v)
+ scale_max = min(scale_max, *v);
+ if (*v > stripe_clock_hand_max)
+ scale_min = max(scale_min, *v - stripe_clock_hand_max);
+ }
+
+ u64 scale = max(scale_min, scale_max);
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
+}
+
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
struct dev_stripe_state *stripe,
struct bch_dev_usage *usage)
{
+ /*
+ * Stripe state has a per device clock hand: we allocate from the device
+ * with the smallest clock hand.
+ *
+ * When we allocate, we don't do a simple increment; we add the inverse
+ * of the device's free space. This results in round robin behavior that
+ * biases in favor of the device(s) with more free space.
+ */
+
u64 *v = stripe->next_alloc + ca->dev_idx;
u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
u64 free_space_inv = free_space
- ? div64_u64(1ULL << 48, free_space)
- : 1ULL << 48;
- u64 scale = *v / 4;
+ ? div64_u64(stripe_clock_hand_inv, free_space)
+ : stripe_clock_hand_inv;
- if (*v + free_space_inv >= *v)
- *v += free_space_inv;
- else
- *v = U64_MAX;
+ /* Saturating add, avoid overflow: */
+ u64 sum = *v + free_space_inv;
+ *v = sum >= *v ? sum : U64_MAX;
- for (v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
+ if (unlikely(*v > stripe_clock_hand_rescale))
+ bch2_stripe_state_rescale(stripe);
}
void bch2_dev_stripe_increment(struct bch_dev *ca,
@@ -1633,7 +1669,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
unsigned nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
@@ -1656,7 +1692,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+ prt_printf(out, "buckets to invalidate\t%llu\r\n",
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
}
static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 21d1d86d5008..ff26bb515150 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -252,12 +252,24 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
0,
bp.v->level,
iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
}
+ /*
+ * peek_slot() doesn't normally return NULL - except when we ask for a
+ * key at a btree level that doesn't exist.
+ *
+ * We may want to revisit this and change peek_slot():
+ */
+ if (!k.k) {
+ bkey_init(&iter->k);
+ iter->k.p = bp.v->pos;
+ k.k = &iter->k;
+ }
+
if (k.k &&
extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
return k;
@@ -293,7 +305,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
0,
bp.v->level - 1,
0);
- struct btree *b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, iter);
if (IS_ERR_OR_NULL(b))
goto err;
@@ -321,7 +333,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st
return 0;
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
+ struct btree_iter alloc_iter = {};
struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -462,7 +474,7 @@ err:
if (bio)
bio_put(bio);
kvfree(data_buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
printbuf_exit(&buf);
return ret;
}
@@ -650,7 +662,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
retry:
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(&iter);
+ b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -934,7 +946,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f52311017aee..5d9f208a1bb7 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -524,8 +524,8 @@ struct bch_dev {
struct percpu_ref ref;
#endif
struct completion ref_completion;
- struct percpu_ref io_ref;
- struct completion io_ref_completion;
+ struct percpu_ref io_ref[2];
+ struct completion io_ref_completion[2];
struct bch_fs *fs;
@@ -562,7 +562,8 @@ struct bch_dev {
unsigned long *bucket_backpointer_mismatches;
unsigned long *bucket_backpointer_empty;
- struct bch_dev_usage __percpu *usage;
+ struct bch_dev_usage_full __percpu
+ *usage;
/* Allocator: */
u64 alloc_cursor[3];
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 2025d408979c..7b98ba2dec64 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -691,7 +691,7 @@ retry_root:
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err_root;
@@ -1199,7 +1199,7 @@ int bch2_gc_gens(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc, ({
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1d94a2bf706d..5fd4a58d2ad2 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1353,7 +1353,7 @@ start:
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick, false);
@@ -1609,6 +1609,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
+ percpu_ref_put(&ca->io_ref[READ]);
}
ra->err[rb->idx] = bio->bi_status;
@@ -1908,7 +1909,8 @@ static void btree_node_scrub_work(struct work_struct *work)
scrub->key.k->k.p, 0, scrub->level - 1, 0);
struct btree *b;
- int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
+ int ret = lockrestart_do(trans,
+ PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
if (ret)
goto err;
@@ -1927,7 +1929,7 @@ err:
printbuf_exit(&err);
bch2_bkey_buf_exit(&scrub->key, c);;
btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- percpu_ref_put(&scrub->ca->io_ref);
+ percpu_ref_put(&scrub->ca->io_ref[READ]);
kfree(scrub);
bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
}
@@ -1996,7 +1998,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
return 0;
err_free:
btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
err:
bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
return ret;
@@ -2144,6 +2146,7 @@ static void btree_node_write_endio(struct bio *bio)
if (ca && bio->bi_status) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, "btree write error: %s\n ",
bch2_blk_status_to_str(bio->bi_status));
bch2_btree_pos_to_text(&buf, c, b);
@@ -2158,8 +2161,12 @@ static void btree_node_write_endio(struct bio *bio)
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
+ /*
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
+ * btree writes yet (due to device removal/ro):
+ */
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
if (parent) {
bio_put(bio);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index a9c110b846b5..e34e9598ef25 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -244,10 +244,8 @@ void bch2_trans_verify_paths(struct btree_trans *trans)
bch2_btree_path_verify(trans, path);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -276,9 +274,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_gt(iter->pos, iter->k.p)));
}
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+static int bch2_btree_iter_verify_ret(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_trans *trans = iter->trans;
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
@@ -299,7 +297,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
BTREE_ITER_nopreserve|
BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(&copy);
+ prev = bch2_btree_iter_prev(trans, &copy);
if (!prev.k)
goto out;
@@ -365,9 +363,11 @@ static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
+ struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k) { return 0; }
#endif
@@ -1855,10 +1855,8 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
if (!iter->path || trans->restarted)
return;
@@ -1870,17 +1868,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
/* Btree iterators: */
int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ return bch2_btree_path_traverse(trans, iter->path, iter->flags);
}
int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
- int ret;
-
bch2_trans_verify_not_unlocked_or_in_restart(trans);
iter->path = bch2_btree_path_set_pos(trans, iter->path,
@@ -1888,7 +1883,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
return ret;
@@ -1900,14 +1895,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
/* Iterate across nodes (leaf and interior nodes) */
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1929,7 +1924,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -1938,26 +1933,26 @@ err:
}
/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter)
{
struct btree *b;
- while (b = bch2_btree_iter_peek_node(iter),
+ while (b = bch2_btree_iter_peek_node(trans, iter),
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return b;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -2024,7 +2019,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -2034,7 +2029,7 @@ err:
/* Iterate across keys (in leaf nodes only) */
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2043,11 +2038,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2056,7 +2051,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
@@ -2183,9 +2178,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
* bkey_s_c_null:
*/
static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey u;
struct bkey_s_c k;
@@ -2231,14 +2226,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
return k;
}
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
int ret;
EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2248,7 +2243,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2258,7 +2253,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2269,10 +2264,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
@@ -2305,27 +2300,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
}
/**
* bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
struct bpos iter_pos = iter->pos;
@@ -2348,7 +2344,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
}
while (1) {
- k = __bch2_btree_iter_peek(iter, search_key);
+ k = __bch2_btree_iter_peek(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2462,9 +2458,9 @@ out_no_locked:
if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
- ret = bch2_btree_iter_verify_ret(iter, k);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
}
@@ -2472,7 +2468,7 @@ out_no_locked:
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2480,24 +2476,25 @@ end:
/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek(iter);
+ return bch2_btree_iter_peek(trans, iter);
}
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2507,7 +2504,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2517,7 +2514,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2533,10 +2530,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
@@ -2557,25 +2554,27 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
+ bch2_btree_iter_set_pos(trans, iter, POS_MIN);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
}
/**
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys greater than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
!bkey_eq(iter->pos, POS_MAX)) {
@@ -2587,7 +2586,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
* real visible extents - easiest to just use peek_slot() (which
* internally uses peek() for extents)
*/
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -2597,7 +2596,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
return k;
}
- struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct bkey_s_c k;
btree_path_idx_t saved_path = 0;
@@ -2613,7 +2611,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
}
while (1) {
- k = __bch2_btree_iter_peek_prev(iter, search_key);
+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2704,10 +2702,10 @@ out_no_locked:
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2715,27 +2713,27 @@ end:
/**
* bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_prev(iter);
+ return bch2_btree_iter_peek_prev(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key;
struct bkey_s_c k;
int ret;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
@@ -2751,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
}
search_key = btree_iter_search_key(iter);
@@ -2785,7 +2783,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out;
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
@@ -2812,8 +2810,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2824,9 +2822,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_max(iter, end);
+ k = bch2_btree_iter_peek_max(trans, iter, end);
if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
else
iter->pos = pos;
}
@@ -2857,39 +2855,39 @@ out:
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_verify_ret(iter, k);
+ bch2_btree_iter_verify(trans, iter);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret))
return bkey_s_c_err(ret);
return k;
}
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
{
struct bkey_s_c k;
- while (btree_trans_too_many_iters(iter->trans) ||
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return k;
}
@@ -3035,7 +3033,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
- iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -3075,10 +3072,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
BUG_ON(iter->min_depth != depth);
}
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_copy_iter(struct btree_trans *trans,
+ struct btree_iter *dst, struct btree_iter *src)
{
- struct btree_trans *trans = src->trans;
-
*dst = *src;
#ifdef TRACK_PATH_ALLOCATED
dst->ip_allocated = _RET_IP_;
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index e6f51a3b8187..9d2cccf5d21a 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -393,36 +393,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct
void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
+int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+ return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
}
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
+ return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
}
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_advance(struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_iter *);
+bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
@@ -433,10 +434,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
iter->k.size = 0;
}
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos new_pos)
{
- struct btree_trans *trans = iter->trans;
-
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@@ -454,13 +454,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
iter->pos = bkey_start_pos(&iter->k);
}
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, u32 snapshot)
{
struct bpos pos = iter->pos;
iter->snapshot = snapshot;
pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
}
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
@@ -502,7 +503,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- iter->trans = trans;
iter->update_path = 0;
iter->key_cache_path = 0;
iter->btree_id = btree_id;
@@ -539,9 +539,9 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
+void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -588,7 +588,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
struct bkey_s_c k;
bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
if (!bkey_err(k) && type && k.k->type != type)
k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
@@ -658,14 +658,14 @@ u32 bch2_trans_begin(struct btree_trans *);
int _ret3 = 0; \
do { \
_ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
if (!_b) \
break; \
\
PTR_ERR_OR_ZERO(_b) ?: (_do); \
})) ?: \
lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
} while (!_ret3); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
@@ -677,31 +677,34 @@ u32 bch2_trans_begin(struct btree_trans *);
__for_each_btree_node(_trans, _iter, _btree_id, _start, \
0, 0, _flags, _b, _do)
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek_prev(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek_prev(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
{
if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(iter, end);
+ return bch2_btree_iter_peek_max(trans, iter, end);
if (bkey_gt(iter->pos, end))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
int __bch2_btree_trans_too_many_iters(struct btree_trans *);
@@ -768,14 +771,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
_end, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -813,14 +816,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
(_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -850,37 +853,38 @@ transaction_restart: \
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
+ struct btree_iter *);
#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
-#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
+#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
for (; \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(&(_iter)))
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(_trans, &(_iter)))
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
+ for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index edce59433375..2b186584a291 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -287,6 +287,19 @@ err:
return ret;
}
+static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+}
+
static noinline int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
unsigned flags)
@@ -306,7 +319,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -320,18 +333,11 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
if (ret)
goto err;
- if (trace_key_cache_fill_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
- }
+ if (trace_key_cache_fill_enabled())
+ do_trace_key_cache_fill(trans, ck_path, k);
out:
/* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -412,7 +418,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_intent);
b_iter.flags &= ~BTREE_ITER_with_key_cache;
- ret = bch2_btree_iter_traverse(&c_iter);
+ ret = bch2_btree_iter_traverse(trans, &c_iter);
if (ret)
goto out;
@@ -444,7 +450,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
ret = bkey_err(btree_k);
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 25d54b77cdc2..8c9fdb7263fe 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
closure_put(w->cl);
kfree(w);
return 0;
@@ -291,7 +291,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
if (!w) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
ret = -ENOMEM;
goto err;
}
@@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f)
struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
kfree(w);
bch_err_msg(c, ret, "starting kthread");
break;
}
closure_get(&cl);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
wake_up_process(t);
}
err:
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 77578da2d23f..023c472dc9ee 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -367,7 +367,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
- struct btree_trans *trans;
btree_path_idx_t path;
btree_path_idx_t update_path;
btree_path_idx_t key_cache_path;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c05394f56424..1e6b7836cc01 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -126,7 +126,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
struct bpos new_pos)
{
struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
+ struct btree_iter old_iter, new_iter = {};
struct bkey_s_c old_k, new_k;
snapshot_id_list s;
struct bkey_i *update;
@@ -140,7 +140,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
BTREE_ITER_not_extents|
BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+ while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
struct bpos whiteout_pos =
@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_intent|
BTREE_ITER_with_updates|
BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -322,8 +322,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
if (done)
goto out;
next:
- bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ bch2_btree_iter_advance(trans, &iter);
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -592,13 +592,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
int ret = bkey_err(k);
if (ret)
goto err;
- bch2_btree_iter_advance(iter);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_advance(trans, iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -634,7 +634,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
BTREE_ITER_cached|
BTREE_ITER_not_extents|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -646,7 +646,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -695,7 +695,7 @@ int bch2_btree_delete(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, btree, pos,
BTREE_ITER_cached|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -713,7 +713,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
+ while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@@ -808,7 +808,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_bit_mod_iter(trans, &iter, set);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index bf7e1dac7f46..55fbeeb8eaaa 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2147,7 +2147,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(iter);
+ int ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
goto err;
@@ -2239,7 +2239,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter,
btree, k->k.p,
BTREE_MAX_DEPTH, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
@@ -2262,7 +2262,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
/* Traverse one depth lower to get a pointer to the node itself: */
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -2406,7 +2406,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bool skip_triggers)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter2 = { NULL };
+ struct btree_iter iter2 = {};
struct btree *parent;
int ret;
@@ -2430,7 +2430,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
iter2.flags & BTREE_ITER_intent,
@@ -2444,7 +2444,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
- ret = bch2_btree_iter_traverse(&iter2) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 2c09d19dd621..adbe576ec77e 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -144,7 +144,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -208,7 +208,7 @@ btree_write_buffered_insert(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
@@ -285,7 +285,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
@@ -368,7 +368,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
write_locked = false;
ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_foreground_maybe_merge(trans, iter.path, 0,
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_journal_reclaim|
@@ -385,7 +385,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
- bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;
bool accounting_accumulated = false;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 0903311cc71e..fea61e60a9ee 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -30,6 +30,12 @@
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
+}
+
+void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
+{
memset(usage, 0, sizeof(*usage));
acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
}
@@ -75,7 +81,7 @@ bch2_fs_usage_read_short(struct bch_fs *c)
void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
- struct bch_dev_usage *usage)
+ struct bch_dev_usage_full *usage)
{
if (out->nr_tabstops < 5) {
printbuf_tabstops_reset(out);
@@ -365,7 +371,7 @@ found:
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, new,
BTREE_UPDATE_internal_snapshot_node|
BTREE_TRIGGER_norun);
@@ -707,7 +713,7 @@ err:
struct disk_accounting_pos acc;
memset(&acc, 0, sizeof(acc));
acc.type = BCH_DISK_ACCOUNTING_replicas;
- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
+ unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
gc_stripe_unlock(m);
acc.replicas.data_type = data_type;
@@ -1132,7 +1138,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
for_each_online_member(c, ca) {
int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ret;
}
}
@@ -1331,7 +1337,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- ca->usage = alloc_percpu(struct bch_dev_usage);
+ ca->usage = alloc_percpu(struct bch_dev_usage_full);
if (!ca->usage)
return -BCH_ERR_ENOMEM_usage_init;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index c5363256e363..1c38b165f48b 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -172,7 +172,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
+void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
+static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
+{
+ struct bch_dev_usage_full ret;
+
+ bch2_dev_usage_full_read_fast(ca, &ret);
+ return ret;
+}
+
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -207,7 +216,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets -
+ usage.buckets[BCH_DATA_free]-
ca->nr_open_buckets -
bch2_dev_buckets_reserved(ca, watermark));
}
@@ -217,10 +226,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets
- + usage.d[BCH_DATA_cached].buckets
- + usage.d[BCH_DATA_need_gc_gens].buckets
- + usage.d[BCH_DATA_need_discard].buckets
+ usage.buckets[BCH_DATA_free]
+ + usage.buckets[BCH_DATA_cached]
+ + usage.buckets[BCH_DATA_need_gc_gens]
+ + usage.buckets[BCH_DATA_need_discard]
- ca->nr_open_buckets
- bch2_dev_buckets_reserved(ca, watermark));
}
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 900b8680c8b5..0aed2500ade3 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -54,7 +54,12 @@ struct bucket_gens {
u8 b[] __counted_by(nbuckets);
};
+/* Only info on bucket countns: */
struct bch_dev_usage {
+ u64 buckets[BCH_DATA_NR];
+};
+
+struct bch_dev_usage_full {
struct bch_dev_usage_type {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 584f4a3eb670..5891b3a1e61c 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -350,8 +350,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (ctx->arg.op == BCH_DATA_OP_scrub) {
struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
if (ca) {
- struct bch_dev_usage u;
- bch2_dev_usage_read_fast(ca, &u);
+ struct bch_dev_usage_full u;
+ bch2_dev_usage_full_read_fast(ca, &u);
for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
if (ctx->arg.scrub.data_types & BIT(i))
e.p.sectors_total += u.d[i].sectors;
@@ -473,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
unsigned i;
@@ -493,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -514,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
struct bch_ioctl_dev_usage_v2 __user *user_arg)
{
struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
int ret = 0;
@@ -534,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -615,7 +615,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
for_each_online_member(c, ca)
if (ca->dev == dev) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ca->dev_idx;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 1f8e035d7119..d6dd12d74d4f 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -121,7 +121,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
} while (0);
__set_current_state(TASK_RUNNING);
- del_timer_sync(&wait.cpu_timer);
+ timer_delete_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 85fc90342492..28ed32449913 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -371,13 +371,14 @@ static int attempt_compress(struct bch_fs *c,
};
zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm,
+ if (zlib_deflateInit2(&strm,
compression.level
? clamp_t(unsigned, compression.level,
Z_BEST_SPEED, Z_BEST_COMPRESSION)
: Z_DEFAULT_COMPRESSION,
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
+ Z_DEFAULT_STRATEGY) != Z_OK)
+ return 0;
if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
return 0;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index fe400dfc5d76..de02ebf847ec 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -398,7 +398,7 @@ restart_drop_extra_replicas:
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
+ bch2_btree_iter_set_pos(trans, &iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
if (trace_io_move_finish_enabled())
@@ -426,7 +426,7 @@ nowork:
count_event(c, io_move_fail);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
goto next;
}
out:
@@ -497,7 +497,7 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
bkey_err(k);
}));
bch2_trans_iter_exit(trans, &iter);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 788af88f6979..5a8bc7013512 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -57,7 +57,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
@@ -297,7 +297,7 @@ out:
if (bio)
bio_put(bio);
kvfree(n_ondisk);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
#ifdef CONFIG_DEBUG_FS
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d7f9f79318a2..bf53a029f356 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -417,8 +417,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct qstr src_name_lookup, dst_name_lookup;
- struct btree_iter src_iter = { NULL };
- struct btree_iter dst_iter = { NULL };
+ struct btree_iter src_iter = {};
+ struct btree_iter dst_iter = {};
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
@@ -586,16 +586,16 @@ out_set_src:
}
if (delete_src) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
if (delete_dst) {
- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
@@ -642,7 +642,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
int ret = lockrestart_do(trans,
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
@@ -771,7 +771,7 @@ int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash_info, &iter,
BTREE_UPDATE_internal_snapshot_node);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index a59f6c12529b..b007319b72e9 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -739,7 +739,7 @@ int bch2_accounting_read(struct bch_fs *c)
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
@@ -930,7 +930,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 5df8de0b8c02..1186280b29e9 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -555,9 +555,9 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
? rcu_dereference(c->devs[t.dev])
: NULL;
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ if (ca && percpu_ref_tryget(&ca->io_ref[READ])) {
prt_printf(out, "/dev/%s", ca->name);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
} else {
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 6faeda7ad03d..a396865e8b17 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -105,6 +105,7 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ int rw;
u64 submit_time;
struct bio bio;
};
@@ -462,7 +463,8 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
if (gc)
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
+ unsafe_memcpy(&gc->r.e, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas), "VLA");
}
if (old_s) {
@@ -703,6 +705,7 @@ static void ec_block_endio(struct bio *bio)
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
+ int rw = ec_bio->rw;
bch2_account_io_completion(ca, bio_data_dir(bio),
ec_bio->submit_time, !bio->bi_status);
@@ -724,7 +727,7 @@ static void ec_block_endio(struct bio *bio)
}
bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
closure_put(cl);
}
@@ -775,6 +778,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->rw = rw;
ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
@@ -784,14 +788,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[rw]);
submit_bio(&ec_bio->bio);
offset += b;
}
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
}
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -1264,7 +1268,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
ob->sectors_free,
GFP_KERNEL, 0);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
if (ret)
s->err = ret;
@@ -1836,7 +1840,7 @@ static int __get_existing_stripe(struct btree_trans *trans,
ret = 1;
}
out:
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1949,7 +1953,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
+ bch2_btree_iter_set_pos(trans, &iter, start_pos);
continue;
}
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d4dfd13a8076..baf5dfb32298 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf);
panic(bch2_fmt(c, "panic after error"));
return true;
default:
@@ -45,6 +45,8 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
bool bch2_inconsistent_error(struct bch_fs *c)
{
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
printbuf_indent_add_nextline(&buf, 2);
bool ret = __bch2_inconsistent_error(c, &buf);
@@ -59,6 +61,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
const char *fmt, va_list args)
{
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
bch2_log_msg_start(c, &buf);
@@ -68,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
if (trans)
bch2_trans_updates_to_text(&buf, trans);
bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
printbuf_exit(&buf);
return ret;
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 6aac579a692a..6bb42985306e 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -112,7 +112,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
unsigned nr_iters = 0;
int ret;
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -126,9 +126,9 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
if (ret < 0)
return ret;
- bch2_trans_copy_iter(&copy, iter);
+ bch2_trans_copy_iter(trans, &copy, iter);
- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index a03e2c780cba..19d4599918dc 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -183,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index c80ed3a54e70..65c2c33d253d 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -48,7 +48,7 @@ static void nocow_flush_endio(struct bio *_bio)
struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
closure_put(bio->cl);
- percpu_ref_put(&bio->ca->io_ref);
+ percpu_ref_put(&bio->ca->io_ref[WRITE]);
bio_put(&bio->bio);
}
@@ -71,7 +71,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
rcu_read_lock();
ca = rcu_dereference(c->devs[dev]);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
+ if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE]))
ca = NULL;
rcu_read_unlock();
@@ -636,9 +636,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@@ -649,13 +649,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
/* already reserved */
if (bkey_extent_is_reservation(k) &&
bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
if (bkey_extent_is_data(k.k) &&
!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
@@ -676,7 +676,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
}
- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+ bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
if (ret)
goto bkey_err;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index fc834bdf1f52..5a41b1a8e54f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -88,7 +88,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
void *p, unsigned fields)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
retry:
@@ -1075,7 +1075,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans *trans;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
@@ -1330,9 +1330,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_max(&iter, end);
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
continue;
@@ -1342,7 +1342,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
@@ -1380,7 +1380,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bkey_copy(prev.k, cur.k);
have_extent = true;
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
}
bch2_trans_iter_exit(trans, &iter);
@@ -1697,17 +1697,17 @@ retry:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter1, snapshot);
- bch2_btree_iter_set_snapshot(&iter2, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
if (ret)
goto err;
if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+ bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
- k = bch2_btree_iter_peek_slot(&iter1);
+ k = bch2_btree_iter_peek_slot(trans, &iter1);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1731,7 +1731,7 @@ retry:
* File with multiple hardlinks and our backref is to the wrong
* directory - linear search:
*/
- for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
if (k.k->p.inode > dir->ei_inode.bi_inum)
break;
@@ -2237,7 +2237,7 @@ got_sb:
/* XXX: create an anonymous device for multi device filesystems */
sb->s_bdev = bdev;
sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
break;
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 52320295dcf6..18308f3d64a1 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -186,7 +186,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = { NULL };
+ struct btree_iter lostfound_iter = {};
u64 inum = 0;
unsigned d_type = 0;
int ret;
@@ -295,8 +295,8 @@ create_lostfound:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(&lostfound_iter);
+ bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
if (ret)
goto err;
@@ -544,7 +544,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
new_inode.bi_subvol = subvolid;
int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret)
@@ -609,7 +609,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct btree_iter iter = {};
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
@@ -1557,7 +1557,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = { NULL };
+ struct btree_iter iter1, iter2 = {};
struct bkey_s_c k1, k2;
int ret;
@@ -1566,7 +1566,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter1, btree, pos1,
BTREE_ITER_all_snapshots|
BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
+ k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
goto err;
@@ -1586,12 +1586,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
goto err;
}
- bch2_trans_copy_iter(&iter2, &iter1);
+ bch2_trans_copy_iter(trans, &iter2, &iter1);
while (1) {
- bch2_btree_iter_advance(&iter2);
+ bch2_btree_iter_advance(trans, &iter2);
- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
+ k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
ret = bkey_err(k2);
if (ret)
goto err;
@@ -1791,9 +1791,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
- ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_btree_delete_at(trans, &iter2,
BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter2);
@@ -2185,7 +2185,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BTREE_ID_dirents,
SPOS(k.k->p.inode, k.k->p.offset, *i),
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&delete_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
hash_info,
&delete_iter,
@@ -2412,7 +2412,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
bch2_trans_iter_exit(trans, &parent_iter);
bch2_trans_iter_init(trans, &parent_iter,
BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(&parent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &parent_iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 80051073f613..b51d98cf8a80 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -940,7 +940,7 @@ int bch2_inode_create(struct btree_trans *trans,
BTREE_ITER_intent);
struct bkey_s_c k;
again:
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((k = bch2_btree_iter_peek(trans, iter)).k &&
!(ret = bkey_err(k)) &&
bkey_lt(k.k->p, POS(0, max))) {
if (pos < iter->pos.offset)
@@ -951,7 +951,7 @@ again:
* we've found just one:
*/
pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
}
if (!ret && pos < max)
@@ -967,12 +967,12 @@ again:
/* Retry from start */
pos = start = min;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -1009,9 +1009,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_max(&iter, end);
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1042,7 +1042,7 @@ err:
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
u32 snapshot;
int ret;
@@ -1207,7 +1207,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 6b842c8d21be..cc07729a4b62 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
bch2_bkey_buf_init(&new);
closure_init_stack(&cl);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
return ret;
@@ -164,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
/*
* peek_max() doesn't have ideal semantics for extents:
*/
- k = bch2_btree_iter_peek_max(iter, end_pos);
+ k = bch2_btree_iter_peek_max(trans, iter, end_pos);
if (!k.k)
break;
@@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans,
u64 new_i_size,
bool warn)
{
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
@@ -399,7 +399,7 @@ case LOGGED_OP_FINSERT_start:
if (ret)
goto err;
} else {
- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+ bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -425,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents:
if (ret)
goto btree_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
k = insert
- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
+ ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index fd01e67b3e84..417bb0c7bbfa 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -394,7 +394,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
if (rbio->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
if (rbio->split) {
@@ -909,7 +909,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
prt_printf(&buf, "memory gen: %u", gen);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
if (!ret) {
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
@@ -1003,7 +1003,7 @@ retry_pick:
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
goto retry_pick;
}
@@ -1036,7 +1036,7 @@ retry_pick:
*/
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
rbio->ret = -BCH_ERR_data_read_buffer_too_small;
goto out_read_done;
}
@@ -1285,12 +1285,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, bvec_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 07b55839768e..a418fa62f09d 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -168,9 +168,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- bch2_trans_copy_iter(&iter, extent_iter);
+ bch2_trans_copy_iter(trans, &iter, extent_iter);
- for_each_btree_key_max_continue_norestart(iter,
+ for_each_btree_key_max_continue_norestart(trans, iter,
new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
@@ -292,7 +292,7 @@ int bch2_extent_update(struct btree_trans *trans,
* path already traversed at iter->pos because
* bch2_trans_extent_update() will use it to attempt extent merging
*/
- ret = __bch2_btree_iter_traverse(iter);
+ ret = __bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -337,7 +337,7 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
+ bch2_btree_iter_set_pos(trans, iter, next_pos);
return 0;
}
@@ -445,6 +445,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
+ /*
+ * XXX: btree writes should be using io_ref[WRITE], but we
+ * aren't retrying failed btree writes yet (due to device
+ * removal/ro):
+ */
struct bch_dev *ca = nocow
? bch2_dev_have_ref(c, ptr->dev)
: bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
@@ -697,12 +702,19 @@ static void bch2_write_endio(struct bio *bio)
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);
- if (bio->bi_status) {
- bch_err_inum_offset_ratelimited(ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
+ if (unlikely(bio->bi_status)) {
+ if (ca)
+ bch_err_inum_offset_ratelimited(ca,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ else
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_io_error;
}
@@ -715,7 +727,7 @@ static void bch2_write_endio(struct bio *bio)
}
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -1293,7 +1305,7 @@ retry:
if (ret)
break;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
break;
@@ -1377,7 +1389,7 @@ retry:
bch2_keylist_push(&op->insert_keys);
if (op->flags & BCH_WRITE_submitted)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
out:
bch2_trans_iter_exit(trans, &iter);
@@ -1414,7 +1426,7 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]);
/* Fall back to COW path: */
goto out;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8a36d5536668..d8f74b6d0a75 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1315,7 +1315,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ret;
}
}
@@ -1404,6 +1404,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
nr = cur_seq - last_seq;
+ /*
+ * Extra fudge factor, in case we crashed when the journal pin fifo was
+ * nearly or completely full. We'll need to be able to open additional
+ * journal entries (at least a few) in order for journal replay to get
+ * going:
+ */
+ nr += nr / 4;
+
if (nr + 1 > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
@@ -1461,11 +1469,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->reservations.idx = journal_cur_seq(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
-
- bch2_journal_space_available(j);
spin_unlock(&j->lock);
- return bch2_journal_reclaim_start(j);
+ return 0;
}
/* init/exit: */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 2debc213e47c..1b7961f4f609 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1218,7 +1218,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvfree(buf.data);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
closure_return(cl);
return;
err:
@@ -1253,7 +1253,7 @@ int bch2_journal_read(struct bch_fs *c,
if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro) &&
- percpu_ref_tryget(&ca->io_ref))
+ percpu_ref_tryget(&ca->io_ref[READ]))
closure_call(&ca->journal.read,
bch2_journal_read_device,
system_unbound_wq,
@@ -1768,7 +1768,7 @@ static void journal_write_endio(struct bio *bio)
}
closure_put(&w->io);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
}
static CLOSURE_CALLBACK(journal_write_submit)
@@ -1843,7 +1843,7 @@ static CLOSURE_CALLBACK(journal_write_preflush)
if (w->separate_flush) {
for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[WRITE]);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 57ad662871ba..90dcf80bd64a 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -130,7 +130,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
@@ -154,7 +154,7 @@ retry:
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 5d41260e10da..fc396b9fa754 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -545,7 +545,7 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *
BTREE_ID_reflink, reflink_pos,
BTREE_ITER_not_extents);
- struct bkey_s_c k = bch2_btree_iter_peek(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
if (!k.k || bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
@@ -603,7 +603,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -681,7 +681,7 @@ next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
bch2_trans_iter_exit(trans, &reflink_iter);
@@ -794,7 +794,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&bp_iter);
+ k = bch2_btree_iter_peek(trans, &bp_iter);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -876,7 +876,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (ctxt->stats)
atomic64_add(sectors, &ctxt->stats->sectors_seen);
next:
- bch2_btree_iter_advance(&bp_iter);
+ bch2_btree_iter_advance(trans, &bp_iter);
}
err:
bch2_trans_iter_exit(trans, &bp_iter);
@@ -991,7 +991,7 @@ static int bch2_move_btree(struct bch_fs *c,
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
break;
@@ -1011,7 +1011,7 @@ retry:
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 5126c870ce5b..159410c50861 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -280,7 +280,11 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
s64 wait = S64_MAX, fragmented_allowed, fragmented;
for_each_rw_member(c, ca) {
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
+ struct bch_dev_usage usage;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage.buckets[i] = usage_full.d[i].buckets;
fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
ca->mi.bucket_size) >> 1);
@@ -288,7 +292,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
- fragmented += usage.d[i].fragmented;
+ fragmented += usage_full.d[i].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
}
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index ee7251709fb9..0d65ea96f7a2 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -28,8 +28,8 @@ int bch2_create_trans(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
@@ -127,8 +127,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(&dir_iter);
+ bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dir_iter);
if (ret)
goto err;
}
@@ -177,9 +177,9 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_depth = dir_u->bi_depth + 1;
inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
- ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, new_inode);
err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -193,8 +193,8 @@ int bch2_link_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(c);
u64 dir_offset = 0;
@@ -253,9 +253,9 @@ int bch2_unlink_trans(struct btree_trans *trans,
bool deleting_subvol)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter dirent_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter dirent_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
subvol_inum inum;
u64 now = bch2_current_time(c);
@@ -301,7 +301,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(&dirent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -310,8 +310,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
* If we're deleting a subvolume, we need to really delete the
* dirent, not just emit a whiteout in the current snapshot:
*/
- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dirent_iter);
+ bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dirent_iter);
if (ret)
goto err;
} else {
@@ -390,10 +390,10 @@ int bch2_rename_trans(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = { NULL };
- struct btree_iter dst_dir_iter = { NULL };
- struct btree_iter src_inode_iter = { NULL };
- struct btree_iter dst_inode_iter = { NULL };
+ struct btree_iter src_dir_iter = {};
+ struct btree_iter dst_dir_iter = {};
+ struct btree_iter src_inode_iter = {};
+ struct btree_iter dst_inode_iter = {};
struct bch_hash_info src_hash, dst_hash;
subvol_inum src_inum, dst_inum;
u64 src_offset, dst_offset;
@@ -666,7 +666,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
int ret = 0;
if (inode_points_to_dirent(target, d))
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 8b857fc33244..3d4755d73af7 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
KEY_TYPE_QUOTA_NOCHECK);
advance:
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
return 0;
}
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index b9bde04b66c0..c63fa53f30d2 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -233,7 +233,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -281,7 +281,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -301,7 +301,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
struct btree_iter *work_iter)
{
return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
+ ? bch2_btree_iter_peek(trans, work_iter)
: bkey_s_c_null;
}
@@ -335,7 +335,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
if (bkey_err(k))
return k;
@@ -511,7 +511,7 @@ static int do_rebalance(struct moving_context *ctxt)
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct btree_iter rebalance_work_iter, extent_iter = {};
struct bkey_s_c k;
int ret = 0;
@@ -552,7 +552,7 @@ static int do_rebalance(struct moving_context *ctxt)
if (ret)
break;
- bch2_btree_iter_advance(&rebalance_work_iter);
+ bch2_btree_iter_advance(trans, &rebalance_work_iter);
}
bch2_trans_iter_exit(trans, &extent_iter);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 266c5770c824..79fd18a5a07c 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -198,7 +198,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter);
+ int ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -261,7 +261,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter);
+ ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -270,7 +270,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index ee23f1f93acc..710178e3da4c 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -495,7 +495,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bool reflink_p_may_update_opts_field)
{
struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = { NULL };
+ struct btree_iter reflink_iter = {};
struct bkey_s_c k;
struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
@@ -507,7 +507,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(&reflink_iter);
+ k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -569,12 +569,13 @@ err:
return ret;
}
-static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+static struct bkey_s_c get_next_src(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos end)
{
struct bkey_s_c k;
int ret;
- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
if (bkey_extent_is_unwritten(k))
continue;
@@ -583,7 +584,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
}
if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
}
@@ -647,27 +648,27 @@ s64 bch2_remap_range(struct bch_fs *c,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
&dst_snapshot);
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
if (dst_inum.inum < src_inum.inum) {
/* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(&dst_iter);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter);
if (ret)
continue;
}
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(&src_iter, src_want);
+ bch2_btree_iter_set_pos(trans, &src_iter, src_want);
- src_k = get_next_src(&src_iter, src_end);
+ src_k = get_next_src(trans, &src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
@@ -738,7 +739,7 @@ s64 bch2_remap_range(struct bch_fs *c,
do {
struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
bch2_trans_begin(trans);
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 38261638a611..06bb41a3f360 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -20,7 +20,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return !percpu_ref_is_zero(&ca->io_ref);
+ return !percpu_ref_is_zero(&ca->io_ref[READ]);
}
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
@@ -156,33 +156,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
- unsigned state_mask)
+ unsigned state_mask,
+ int rw)
{
rcu_read_lock();
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
+ !percpu_ref_tryget(&ca->io_ref[rw])))
;
rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(_c, _ca, state_mask) \
+#define __for_each_online_member(_c, _ca, state_mask, rw) \
for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));)
#define for_each_online_member(c, ca) \
- __for_each_online_member(c, ca, ~0)
+ __for_each_online_member(c, ca, ~0, READ)
#define for_each_rw_member(c, ca) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE)
#define for_each_readable_member(c, ca) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ)
static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
@@ -287,7 +288,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
+ if (ca && !percpu_ref_tryget(&ca->io_ref[rw]))
ca = NULL;
rcu_read_unlock();
@@ -297,7 +298,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
return ca;
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
return NULL;
}
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0c65065b08ec..b7de29aed839 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -843,9 +843,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- if (bch2_snapshot_exists(c, id))
- return 0;
-
/* Do we need to reconstruct the snapshot_tree entry as well? */
struct btree_iter iter;
struct bkey_s_c k;
@@ -1074,9 +1071,9 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
- struct btree_iter c_iter = (struct btree_iter) { NULL };
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
+ struct btree_iter iter, p_iter = {};
+ struct btree_iter c_iter = {};
+ struct btree_iter tree_iter = {};
struct bkey_s_c_snapshot s;
u32 parent_id, child_id;
unsigned i;
@@ -1193,13 +1190,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(&iter);
+ k = bch2_btree_iter_prev_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 602afca2f5ef..a90bf7b8a2b4 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -195,7 +195,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
int ret = 0;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 575ad1e03904..09a354a26c3b 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -231,11 +231,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_trans_copy_iter(&iter, start);
+ bch2_trans_copy_iter(trans, &iter, start);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -280,7 +280,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, iter);
+ bch2_trans_copy_iter(trans, &slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index cd0d8e5e44e7..5537283d0bea 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -275,7 +275,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
bch2_trans_iter_exit(trans, &iter);
return bkey_err(k) ?: k.k && k.k->p.inode == subvol
@@ -574,7 +574,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
bool ro)
{
struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct btree_iter dst_iter, src_iter = {};
struct bkey_i_subvolume *new_subvol = NULL;
struct bkey_i_subvolume *src_subvol = NULL;
u32 parent = 0, new_nodes[2], snapshot_subvols[2];
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 910f6196700e..f640c1e3d639 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
- u32 subvolid, unsigned flags)
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, u32 subvolid, unsigned flags)
{
u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
if (ret)
return bkey_s_c_err(ret);
- bch2_btree_iter_set_snapshot(iter, snapshot);
- return bch2_btree_iter_peek_max_type(iter, end, flags);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
+ return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
}
#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
@@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
_end, _subvolid, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 572b06bfa0b8..e27422b6d9c6 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -248,7 +248,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return NULL;
}
}
@@ -945,7 +945,7 @@ static void write_super_endio(struct bio *bio)
}
closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -963,7 +963,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
closure_bio_submit(bio, &c->sb_write);
}
@@ -989,7 +989,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
closure_bio_submit(bio, &c->sb_write);
}
@@ -1014,13 +1014,20 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ /*
+ * Note: we do writes to RO devices here, and we might want to change
+ * that in the future.
+ *
+ * For now, we expect to be able to call write_super() when we're not
+ * yet RW:
+ */
for_each_online_member(c, ca) {
ret = darray_push(&online_devices, ca);
if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
goto out;
}
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
}
/* Make sure we're using the new magic numbers: */
@@ -1186,7 +1193,7 @@ out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
darray_for_each(online_devices, ca)
- percpu_ref_put(&(*ca)->io_ref);
+ percpu_ref_put(&(*ca)->io_ref[READ]);
darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 20208f3c5d8b..a58edde43bee 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -185,6 +185,7 @@ static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
@@ -294,8 +295,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
/*
* After stopping journal:
*/
- for_each_member_device(c, ca)
+ for_each_member_device(c, ca) {
+ bch2_dev_io_ref_stop(ca, WRITE);
bch2_dev_allocator_remove(c, ca);
+ }
}
#ifndef BCH_WRITE_REF_DEBUG
@@ -465,10 +468,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- goto err;
-
clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
@@ -480,10 +479,24 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
- for_each_rw_member(c, ca)
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
bch2_dev_allocator_add(c, ca);
+ percpu_ref_reinit(&ca->io_ref[WRITE]);
+ }
bch2_recalc_capacity(c);
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ spin_lock(&c->journal.lock);
+ bch2_journal_space_available(&c->journal);
+ spin_unlock(&c->journal.lock);
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
+
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
@@ -495,11 +508,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
atomic_long_inc(&c->writes[i]);
}
#endif
-
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -675,6 +683,7 @@ void bch2_fs_free(struct bch_fs *c)
if (ca) {
EBUG_ON(atomic_long_read(&ca->ref) != 1);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -1199,6 +1208,15 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
/* Device startup/shutdown: */
+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_is_zero(&ca->io_ref[rw])) {
+ reinit_completion(&ca->io_ref_completion[rw]);
+ percpu_ref_kill(&ca->io_ref[rw]);
+ wait_for_completion(&ca->io_ref_completion[rw]);
+ }
+}
+
static void bch2_dev_release(struct kobject *kobj)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
@@ -1208,6 +1226,9 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
+ WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
+ WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+
cancel_work_sync(&ca->io_error_work);
bch2_dev_unlink(ca);
@@ -1226,7 +1247,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
- percpu_ref_exit(&ca->io_ref);
+ percpu_ref_exit(&ca->io_ref[WRITE]);
+ percpu_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
#endif
@@ -1238,14 +1260,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
- if (percpu_ref_is_zero(&ca->io_ref))
+ if (percpu_ref_is_zero(&ca->io_ref[READ]))
return;
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->io_ref_completion);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->io_ref_completion);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_dev_unlink(ca);
@@ -1262,11 +1282,18 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
}
#endif
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]);
+
+ complete(&ca->io_ref_completion[READ]);
+}
+
+static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref)
{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]);
- complete(&ca->io_ref_completion);
+ complete(&ca->io_ref_completion[WRITE]);
}
static void bch2_dev_unlink(struct bch_dev *ca)
@@ -1330,7 +1357,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
- init_completion(&ca->io_ref_completion);
+ init_completion(&ca->io_ref_completion[READ]);
+ init_completion(&ca->io_ref_completion[WRITE]);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1356,7 +1384,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
bch2_dev_allocator_background_init(ca);
- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+ if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
@@ -1419,7 +1449,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
return -BCH_ERR_device_size_too_small;
}
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
@@ -1438,7 +1469,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->dev = ca->disk_sb.bdev->bd_dev;
- percpu_ref_reinit(&ca->io_ref);
+ percpu_ref_reinit(&ca->io_ref[READ]);
return 0;
}
@@ -1568,6 +1599,8 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
+ bch2_dev_io_ref_stop(ca, WRITE);
+
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
@@ -1584,6 +1617,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+
+ if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
+ percpu_ref_reinit(&ca->io_ref[WRITE]);
+
bch2_dev_do_discards(ca);
}
@@ -1731,7 +1768,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- !percpu_ref_is_zero(&ca->io_ref))
+ !percpu_ref_is_zero(&ca->io_ref[READ]))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return ret;
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 6c6469814637..c265b102267a 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting once");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (first)");
if (ret)
@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting twice");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (second)");
if (ret)
@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
bch2_journal_flush_all_pins(&c->journal);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error");
if (ret)
@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -602,9 +602,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
ret = bkey_err(k);
if (ret)
break;
@@ -623,9 +623,9 @@ static int rand_mixed_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(trans, iter);
ret = bkey_err(k);
bch_err_msg(trans->c, ret, "lookup error");
if (ret)
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index f9667b944c0d..651da52b2cbc 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int type, int flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index cd5f38d6fbaa..3541efa765c7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -225,7 +225,7 @@ void zstd_cleanup_workspace_manager(void)
}
spin_unlock_bh(&wsm.lock);
- del_timer_sync(&wsm.timer);
+ timer_delete_sync(&wsm.timer);
}
/*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 83a60126de0f..14d0cc894000 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -128,10 +128,11 @@ retry:
ret = security_path_mkdir(&path, subdir, 0700);
if (ret < 0)
goto mkdir_error;
- subdir = ERR_PTR(cachefiles_inject_write_error());
- if (!IS_ERR(subdir))
+ ret = cachefiles_inject_write_error();
+ if (ret == 0)
subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- ret = PTR_ERR(subdir);
+ else
+ subdir = ERR_PTR(ret);
if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
diff --git a/fs/exec.c b/fs/exec.c
index f45859ad13ac..8e4ea5f1e64c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1227,13 +1227,12 @@ int begin_new_exec(struct linux_binprm * bprm)
*/
bprm->point_of_no_return = true;
- /*
- * Make this the only thread in the thread group.
- */
+ /* Make this the only thread in the thread group */
retval = de_thread(me);
if (retval)
goto out;
-
+ /* see the comment in check_unsafe_exec() */
+ current->fs->in_exec = 0;
/*
* Cancel any io_uring activity across execve
*/
@@ -1495,6 +1494,8 @@ static void free_bprm(struct linux_binprm *bprm)
}
free_arg_pages(bprm);
if (bprm->cred) {
+ /* in case exec fails before de_thread() succeeds */
+ current->fs->in_exec = 0;
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1616,6 +1617,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* suid exec because the differently privileged task
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
+ *
+ * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
+ * from another sub-thread until de_thread() succeeds, this
+ * state is protected by cred_guard_mutex we hold.
*/
n_fs = 1;
spin_lock(&p->fs->lock);
@@ -1859,10 +1864,9 @@ static int bprm_execve(struct linux_binprm *bprm)
goto out;
sched_mm_cid_after_execve(current);
+ rseq_execve(current);
/* execve succeeded */
- current->fs->in_exec = 0;
current->in_execve = 0;
- rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
@@ -1879,7 +1883,7 @@ out:
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
- current->fs->in_exec = 0;
+ rseq_set_notify_resume(current);
current->in_execve = 0;
return retval;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b5845c4846b8..128dd092916b 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -608,4 +608,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+MODULE_DESCRIPTION("Code mapping from inodes to file handles");
MODULE_LICENSE("GPL");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8122d4ffb3b5..181934499624 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5680,7 +5680,7 @@ failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
ext4_stop_mmpd(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_delete_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
#if IS_ENABLED(CONFIG_UNICODE)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51e31df4c546..6dcbaa218b7a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -32,6 +32,100 @@ MODULE_ALIAS("devname:fuse");
static struct kmem_cache *fuse_req_cachep;
+const unsigned long fuse_timeout_timer_freq =
+ secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list)
+{
+ struct fuse_req *req;
+
+ req = list_first_entry_or_null(list, struct fuse_req, list);
+ if (!req)
+ return false;
+ return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout);
+}
+
+bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing)
+{
+ int i;
+
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ if (fuse_request_expired(fc, &processing[i]))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if any requests aren't being completed by the time the request timeout
+ * elapses. To do so, we:
+ * - check the fiq pending list
+ * - check the bg queue
+ * - check the fpq io and processing lists
+ *
+ * To make this fast, we only check against the head request on each list since
+ * these are generally queued in order of creation time (eg newer requests get
+ * queued to the tail). We might miss a few edge cases (eg requests transitioning
+ * between lists, re-sent requests at the head of the pending list having a
+ * later creation time than other requests on that list, etc.) but that is fine
+ * since if the request never gets fulfilled, it will eventually be caught.
+ */
+void fuse_check_timeout(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct fuse_conn *fc = container_of(dwork, struct fuse_conn,
+ timeout.work);
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_dev *fud;
+ struct fuse_pqueue *fpq;
+ bool expired = false;
+
+ if (!atomic_read(&fc->num_waiting))
+ goto out;
+
+ spin_lock(&fiq->lock);
+ expired = fuse_request_expired(fc, &fiq->pending);
+ spin_unlock(&fiq->lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->bg_lock);
+ expired = fuse_request_expired(fc, &fc->bg_queue);
+ spin_unlock(&fc->bg_lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->lock);
+ if (!fc->connected) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+ list_for_each_entry(fud, &fc->devices, entry) {
+ fpq = &fud->pq;
+ spin_lock(&fpq->lock);
+ if (fuse_request_expired(fc, &fpq->io) ||
+ fuse_fpq_processing_expired(fc, fpq->processing)) {
+ spin_unlock(&fpq->lock);
+ spin_unlock(&fc->lock);
+ goto abort_conn;
+ }
+
+ spin_unlock(&fpq->lock);
+ }
+ spin_unlock(&fc->lock);
+
+ if (fuse_uring_request_expired(fc))
+ goto abort_conn;
+
+out:
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+ return;
+
+abort_conn:
+ fuse_abort_conn(fc);
+}
+
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
@@ -40,6 +134,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
req->fm = fm;
+ req->create_time = jiffies;
}
static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
@@ -407,6 +502,24 @@ static int queue_interrupt(struct fuse_req *req)
return 0;
}
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (test_bit(FR_PENDING, &req->flags)) {
+ /*
+ * FR_PENDING does not get cleared as the request will end
+ * up in destruction anyway.
+ */
+ list_del(&req->list);
+ spin_unlock(lock);
+ __fuse_put_request(req);
+ req->out.h.error = -EINTR;
+ return true;
+ }
+ spin_unlock(lock);
+ return false;
+}
+
static void request_wait_answer(struct fuse_req *req)
{
struct fuse_conn *fc = req->fm->fc;
@@ -428,22 +541,20 @@ static void request_wait_answer(struct fuse_req *req)
}
if (!test_bit(FR_FORCE, &req->flags)) {
+ bool removed;
+
/* Only fatal signals may interrupt this */
err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
if (!err)
return;
- spin_lock(&fiq->lock);
- /* Request is not yet in userspace, bail out */
- if (test_bit(FR_PENDING, &req->flags)) {
- list_del(&req->list);
- spin_unlock(&fiq->lock);
- __fuse_put_request(req);
- req->out.h.error = -EINTR;
+ if (test_bit(FR_URING, &req->flags))
+ removed = fuse_uring_remove_pending_req(req);
+ else
+ removed = fuse_remove_pending_req(req, &fiq->lock);
+ if (removed)
return;
- }
- spin_unlock(&fiq->lock);
}
/*
@@ -1533,14 +1644,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_entry_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1550,13 +1657,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -1581,14 +1693,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_delete_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1598,13 +1706,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -2275,6 +2388,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
LIST_HEAD(to_end);
unsigned int i;
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work(&fc->timeout.work);
+
/* Background queuing checks fc->connected under bg_lock */
spin_lock(&fc->bg_lock);
fc->connected = 0;
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 82bf458fa9db..accdce2977c5 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -140,6 +140,33 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring)
}
}
+bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ int qid;
+
+ if (!ring)
+ return false;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
+ fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
+ fuse_fpq_processing_expired(fc, queue->fpq.processing)) {
+ spin_unlock(&queue->lock);
+ return true;
+ }
+ spin_unlock(&queue->lock);
+ }
+
+ return false;
+}
+
void fuse_uring_destruct(struct fuse_conn *fc)
{
struct fuse_ring *ring = fc->ring;
@@ -211,7 +238,6 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
ring->nr_queues = nr_queues;
ring->fc = fc;
ring->max_payload_sz = max_payload_size;
- atomic_set(&ring->queue_refs, 0);
smp_store_release(&fc->ring, ring);
spin_unlock(&fc->lock);
@@ -726,8 +752,6 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
struct fuse_req *req)
{
struct fuse_ring_queue *queue = ent->queue;
- struct fuse_conn *fc = req->fm->fc;
- struct fuse_iqueue *fiq = &fc->iq;
lockdep_assert_held(&queue->lock);
@@ -737,9 +761,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
ent->state);
}
- spin_lock(&fiq->lock);
clear_bit(FR_PENDING, &req->flags);
- spin_unlock(&fiq->lock);
ent->fuse_req = req;
ent->state = FRRS_FUSE_REQ;
list_move(&ent->list, &queue->ent_w_req_queue);
@@ -1238,6 +1260,8 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
if (unlikely(queue->stopped))
goto err_unlock;
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
ent = list_first_entry_or_null(&queue->ent_avail_queue,
struct fuse_ring_ent, list);
if (ent)
@@ -1276,6 +1300,8 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
return false;
}
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
list_add_tail(&req->list, &queue->fuse_req_bg_queue);
ent = list_first_entry_or_null(&queue->ent_avail_queue,
@@ -1306,6 +1332,13 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
return true;
}
+bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = req->ring_queue;
+
+ return fuse_remove_pending_req(req, &queue->lock);
+}
+
static const struct fuse_iqueue_ops fuse_io_uring_ops = {
/* should be send over io-uring as enhancement */
.send_forget = fuse_dev_queue_forget,
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 2102b3d0c1ae..51a563922ce1 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -142,6 +142,8 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring);
int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
bool fuse_uring_queue_bq_req(struct fuse_req *req);
+bool fuse_uring_remove_pending_req(struct fuse_req *req);
+bool fuse_uring_request_expired(struct fuse_conn *fc);
static inline void fuse_uring_abort(struct fuse_conn *fc)
{
@@ -172,12 +174,6 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc)
#else /* CONFIG_FUSE_IO_URING */
-struct fuse_ring;
-
-static inline void fuse_uring_create(struct fuse_conn *fc)
-{
-}
-
static inline void fuse_uring_destruct(struct fuse_conn *fc)
{
}
@@ -200,6 +196,16 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc)
return false;
}
+static inline bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ return false;
+}
+
+static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ return false;
+}
+
#endif /* CONFIG_FUSE_IO_URING */
#endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 85e4f894a59f..83ac192e7fdd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -370,7 +370,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
*inode = NULL;
err = -ENAMETOOLONG;
- if (name->len > FUSE_NAME_MAX)
+ if (name->len > fm->fc->name_max)
goto out;
@@ -1137,6 +1137,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
+ if (fm->fc->no_link)
+ goto out;
+
memset(&inarg, 0, sizeof(inarg));
inarg.oldnodeid = get_node_id(inode);
args.opcode = FUSE_LINK;
@@ -1151,6 +1154,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
else if (err == -EINTR)
fuse_invalidate_attr(inode);
+ if (err == -ENOSYS)
+ fm->fc->no_link = 1;
+out:
+ if (fm->fc->no_link)
+ return -EPERM;
+
return err;
}
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 3b2bfe1248d3..b3c2e32254ba 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -61,6 +61,10 @@ int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
struct fuse_forget_link *forget);
void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list);
+bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing);
#endif
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fee96fe7887b..d56d4fd956db 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -38,14 +38,34 @@
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
-/** It could be as large as PATH_MAX, but would that have any uses? */
-#define FUSE_NAME_MAX 1024
+/** Maximum length of a filename, not including terminating null */
+
+/* maximum, small enough for FUSE_MIN_READ_BUFFER*/
+#define FUSE_NAME_LOW_MAX 1024
+/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */
+#define FUSE_NAME_MAX (PATH_MAX - 1)
/** Number of dentries for each connection in the control filesystem */
#define FUSE_CTL_NUM_DENTRIES 5
+/* Frequency (in seconds) of request timeout checks, if opted into */
+#define FUSE_TIMEOUT_TIMER_FREQ 15
+
+/** Frequency (in jiffies) of request timeout checks, if opted into */
+extern const unsigned long fuse_timeout_timer_freq;
+
/** Maximum of max_pages received in init_out */
extern unsigned int fuse_max_pages_limit;
+/*
+ * Default timeout (in seconds) for the server to reply to a request
+ * before the connection is aborted, if no timeout was specified on mount.
+ */
+extern unsigned int fuse_default_req_timeout;
+/*
+ * Max timeout (in seconds) for the server to reply to a request before
+ * the connection is aborted.
+ */
+extern unsigned int fuse_max_req_timeout;
/** List of active connections */
extern struct list_head fuse_conn_list;
@@ -378,6 +398,7 @@ struct fuse_io_priv {
* FR_FINISHED: request is finished
* FR_PRIVATE: request is on private list
* FR_ASYNC: request is asynchronous
+ * FR_URING: request is handled through fuse-io-uring
*/
enum fuse_req_flag {
FR_ISREPLY,
@@ -392,6 +413,7 @@ enum fuse_req_flag {
FR_FINISHED,
FR_PRIVATE,
FR_ASYNC,
+ FR_URING,
};
/**
@@ -441,7 +463,10 @@ struct fuse_req {
#ifdef CONFIG_FUSE_IO_URING
void *ring_entry;
+ void *ring_queue;
#endif
+ /** When (in jiffies) the request was created */
+ unsigned long create_time;
};
struct fuse_iqueue;
@@ -867,6 +892,9 @@ struct fuse_conn {
/* Use pages instead of pointer for kernel I/O */
unsigned int use_pages_for_kvec_io:1;
+ /* Is link not implemented by fs? */
+ unsigned int no_link:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
@@ -900,6 +928,9 @@ struct fuse_conn {
/** Version counter for evict inode */
atomic64_t evict_ctr;
+ /* maximum file name length */
+ u32 name_max;
+
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -935,6 +966,15 @@ struct fuse_conn {
/** uring connection information*/
struct fuse_ring *ring;
#endif
+
+ /** Only used if the connection opts into request timeouts */
+ struct {
+ /* Worker for checking if any requests have timed out */
+ struct delayed_work work;
+
+ /* Request timeout (in jiffies). 0 = no timeout */
+ unsigned int req_timeout;
+ } timeout;
};
/*
@@ -1216,6 +1256,9 @@ void fuse_request_end(struct fuse_req *req);
void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
+/* Check if any requests timed out */
+void fuse_check_timeout(struct work_struct *work);
+
/**
* Invalidate inode attributes
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e9db2cb8c150..fd48e8d37f2e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -37,6 +37,9 @@ DEFINE_MUTEX(fuse_mutex);
static int set_global_limit(const char *val, const struct kernel_param *kp);
unsigned int fuse_max_pages_limit = 256;
+/* default is no timeout */
+unsigned int fuse_default_req_timeout;
+unsigned int fuse_max_req_timeout;
unsigned max_user_bgreq;
module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
@@ -979,6 +982,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
fc->max_pages_limit = fuse_max_pages_limit;
+ fc->name_max = FUSE_NAME_LOW_MAX;
+ fc->timeout.req_timeout = 0;
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_init(fc);
@@ -1007,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work_sync(&fc->timeout.work);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
@@ -1257,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
spin_unlock(&fc->bg_lock);
}
+static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ fc->timeout.req_timeout = secs_to_jiffies(timeout);
+ INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+}
+
+static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout)
+ return;
+
+ if (!timeout)
+ timeout = fuse_default_req_timeout;
+
+ if (fuse_max_req_timeout) {
+ if (timeout)
+ timeout = min(fuse_max_req_timeout, timeout);
+ else
+ timeout = fuse_max_req_timeout;
+ }
+
+ timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout);
+
+ set_request_timeout(fc, timeout);
+}
+
struct fuse_init_args {
struct fuse_args args;
struct fuse_init_in in;
@@ -1275,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
ok = false;
else {
unsigned long ra_pages;
+ unsigned int timeout = 0;
process_init_limits(fc, arg);
@@ -1338,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->max_pages =
min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
+
+ /*
+ * PATH_MAX file names might need two pages for
+ * ops like rename
+ */
+ if (fc->max_pages > 1)
+ fc->name_max = FUSE_NAME_MAX;
}
if (IS_ENABLED(CONFIG_FUSE_DAX)) {
if (flags & FUSE_MAP_ALIGNMENT &&
@@ -1392,12 +1435,17 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
}
if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
fc->io_uring = 1;
+
+ if (flags & FUSE_REQUEST_TIMEOUT)
+ timeout = arg->request_timeout;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
+ init_server_timeout(fc, timeout);
+
fm->sb->s_bdi->ra_pages =
min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
@@ -1439,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
- FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP;
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP |
+ FUSE_REQUEST_TIMEOUT;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c
index 63fb1e5bee30..e2d921abcb88 100644
--- a/fs/fuse/sysctl.c
+++ b/fs/fuse/sysctl.c
@@ -13,6 +13,12 @@ static struct ctl_table_header *fuse_table_header;
/* Bound by fuse_init_out max_pages, which is a u16 */
static unsigned int sysctl_fuse_max_pages_limit = 65535;
+/*
+ * fuse_init_out request timeouts are u16.
+ * This goes up to ~18 hours, which is plenty for a timeout.
+ */
+static unsigned int sysctl_fuse_req_timeout_limit = 65535;
+
static const struct ctl_table fuse_sysctl_table[] = {
{
.procname = "max_pages_limit",
@@ -23,6 +29,24 @@ static const struct ctl_table fuse_sysctl_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = &sysctl_fuse_max_pages_limit,
},
+ {
+ .procname = "default_request_timeout",
+ .data = &fuse_default_req_timeout,
+ .maxlen = sizeof(fuse_default_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
+ {
+ .procname = "max_request_timeout",
+ .data = &fuse_max_req_timeout,
+ .maxlen = sizeof(fuse_max_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
};
int fuse_sysctl_register(void)
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 8b39c15c408c..15b2f094d36e 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -60,7 +60,7 @@ struct hostfs_stat {
unsigned int uid;
unsigned int gid;
unsigned long long size;
- struct hostfs_timespec atime, mtime, ctime;
+ struct hostfs_timespec atime, mtime, ctime, btime;
unsigned int blksize;
unsigned long long blocks;
struct {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index a2c6b9051c5b..702c41317589 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -33,6 +33,7 @@ struct hostfs_inode_info {
struct inode vfs_inode;
struct mutex open_mutex;
dev_t dev;
+ struct hostfs_timespec btime;
};
static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
@@ -547,6 +548,7 @@ static int hostfs_inode_set(struct inode *ino, void *data)
}
HOSTFS_I(ino)->dev = dev;
+ HOSTFS_I(ino)->btime = st->btime;
ino->i_ino = st->ino;
ino->i_mode = st->mode;
return hostfs_inode_update(ino, st);
@@ -557,7 +559,10 @@ static int hostfs_inode_test(struct inode *inode, void *data)
const struct hostfs_stat *st = data;
dev_t dev = MKDEV(st->dev.maj, st->dev.min);
- return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev;
+ return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev &&
+ (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) &&
+ HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec &&
+ HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec;
}
static struct inode *hostfs_iget(struct super_block *sb, char *name)
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 97e9c40a9448..3bcd9f35e70b 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -18,39 +18,48 @@
#include "hostfs.h"
#include <utime.h>
-static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
+static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p)
{
- p->ino = buf->st_ino;
- p->mode = buf->st_mode;
- p->nlink = buf->st_nlink;
- p->uid = buf->st_uid;
- p->gid = buf->st_gid;
- p->size = buf->st_size;
- p->atime.tv_sec = buf->st_atime;
- p->atime.tv_nsec = 0;
- p->ctime.tv_sec = buf->st_ctime;
- p->ctime.tv_nsec = 0;
- p->mtime.tv_sec = buf->st_mtime;
- p->mtime.tv_nsec = 0;
- p->blksize = buf->st_blksize;
- p->blocks = buf->st_blocks;
- p->rdev.maj = os_major(buf->st_rdev);
- p->rdev.min = os_minor(buf->st_rdev);
- p->dev.maj = os_major(buf->st_dev);
- p->dev.min = os_minor(buf->st_dev);
+ p->ino = buf->stx_ino;
+ p->mode = buf->stx_mode;
+ p->nlink = buf->stx_nlink;
+ p->uid = buf->stx_uid;
+ p->gid = buf->stx_gid;
+ p->size = buf->stx_size;
+ p->atime.tv_sec = buf->stx_atime.tv_sec;
+ p->atime.tv_nsec = buf->stx_atime.tv_nsec;
+ p->ctime.tv_sec = buf->stx_ctime.tv_sec;
+ p->ctime.tv_nsec = buf->stx_ctime.tv_nsec;
+ p->mtime.tv_sec = buf->stx_mtime.tv_sec;
+ p->mtime.tv_nsec = buf->stx_mtime.tv_nsec;
+ if (buf->stx_mask & STATX_BTIME) {
+ p->btime.tv_sec = buf->stx_btime.tv_sec;
+ p->btime.tv_nsec = buf->stx_btime.tv_nsec;
+ } else {
+ memset(&p->btime, 0, sizeof(p->btime));
+ }
+ p->blksize = buf->stx_blksize;
+ p->blocks = buf->stx_blocks;
+ p->rdev.maj = buf->stx_rdev_major;
+ p->rdev.min = buf->stx_rdev_minor;
+ p->dev.maj = buf->stx_dev_major;
+ p->dev.min = buf->stx_dev_minor;
}
int stat_file(const char *path, struct hostfs_stat *p, int fd)
{
- struct stat64 buf;
+ struct statx buf;
+ int flags = AT_SYMLINK_NOFOLLOW;
if (fd >= 0) {
- if (fstat64(fd, &buf) < 0)
- return -errno;
- } else if (lstat64(path, &buf) < 0) {
- return -errno;
+ flags |= AT_EMPTY_PATH;
+ path = "";
}
- stat64_to_hostfs(&buf, p);
+
+ if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0)
+ return -errno;
+
+ statx_to_hostfs(&buf, p);
return 0;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a5ccba25ff47..743a1d7633cd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -197,7 +197,7 @@ loop:
if (journal->j_commit_sequence != journal->j_commit_request) {
jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
write_lock(&journal->j_state_lock);
goto loop;
@@ -246,7 +246,7 @@ loop:
goto loop;
end_loop:
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd2_debug(1, "Journal thread exiting.\n");
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4061e0ba7010..bb815a002984 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
size_t retlen;
/* Nothing to do if not write-buffering the flash. In particular, we shouldn't
- del_timer() the timer we never initialised. */
+ call timer_delete() on the timer we never initialised. */
if (!jffs2_is_writebuffered(c))
return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6100e5b962a6..14935a0500a2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2478,7 +2478,8 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- scoped_guard(rwsem_read, &namespace_sem)
+ guard(rwsem_read)(&namespace_sem);
+
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
@@ -5326,8 +5327,10 @@ struct kstatmount {
struct mnt_idmap *idmap;
u64 mask;
struct path root;
- struct statmount sm;
struct seq_file seq;
+
+ /* Must be last --ends in a flexible-array member. */
+ struct statmount sm;
};
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3b0918ade53c..02c916a55020 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -546,6 +546,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
args.flags |= RPC_CLNT_CREATE_NOPING;
if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
args.flags |= RPC_CLNT_CREATE_REUSEPORT;
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL;
if (!IS_ERR(clp->cl_rpcclient))
return 0;
@@ -709,6 +711,9 @@ static int nfs_init_server(struct nfs_server *server,
if (ctx->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL)
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
+
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
if (IS_ERR(clp))
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4db912f56230..8bdbc4dca89c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -79,6 +79,7 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
struct nfs_delegation *delegation)
{
set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
}
@@ -306,7 +307,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
if (delegation == NULL)
goto out;
spin_lock(&delegation->lock);
- if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ if (delegation->inode &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
/* Refcount matched in nfs_end_delegation_return() */
ret = nfs_get_delegation(delegation);
@@ -330,14 +332,16 @@ nfs_start_delegation_return(struct nfs_inode *nfsi)
}
static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
- struct nfs_client *clp, int err)
+ struct nfs_server *server, int err)
{
-
spin_lock(&delegation->lock);
clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
if (err == -EAGAIN) {
set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+ set_bit(NFS4SERV_DELEGRETURN_DELAYED,
+ &server->delegation_flags);
+ set_bit(NFS4CLNT_DELEGRETURN_DELAYED,
+ &server->nfs_client->cl_state);
}
spin_unlock(&delegation->lock);
}
@@ -547,7 +551,7 @@ out:
*/
static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
- struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_server *server = NFS_SERVER(inode);
unsigned int mode = O_WRONLY | O_RDWR;
int err = 0;
@@ -569,11 +573,11 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
/*
* Guard against state recovery
*/
- err = nfs4_wait_clnt_recover(clp);
+ err = nfs4_wait_clnt_recover(server->nfs_client);
}
if (err) {
- nfs_abort_delegation_return(delegation, clp, err);
+ nfs_abort_delegation_return(delegation, server, err);
goto out;
}
@@ -590,17 +594,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
ret = true;
- else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
- struct inode *inode;
-
- spin_lock(&delegation->lock);
- inode = delegation->inode;
- if (inode && list_empty(&NFS_I(inode)->open_files))
- ret = true;
- spin_unlock(&delegation->lock);
- }
- if (ret)
- clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
@@ -619,6 +612,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server,
struct nfs_delegation *place_holder_deleg = NULL;
int err = 0;
+ if (!test_and_clear_bit(NFS4SERV_DELEGRETURN,
+ &server->delegation_flags))
+ return 0;
restart:
/*
* To avoid quadratic looping we hold a reference
@@ -670,6 +666,7 @@ restart:
cond_resched();
if (!err)
goto restart;
+ set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
goto out;
}
@@ -684,6 +681,9 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
struct nfs_delegation *d;
bool ret = false;
+ if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED,
+ &server->delegation_flags))
+ goto out;
list_for_each_entry_rcu (d, &server->delegations, super_list) {
if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags))
continue;
@@ -691,6 +691,7 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags);
ret = true;
}
+out:
return ret;
}
@@ -878,11 +879,25 @@ int nfs4_inode_make_writeable(struct inode *inode)
return nfs4_inode_return_delegation(inode);
}
-static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
- struct nfs_delegation *delegation)
+static void
+nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
{
- set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ struct inode *inode;
+
+ if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags))
+ return;
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (!inode)
+ goto out;
+ if (list_empty(&NFS_I(inode)->open_files))
+ nfs_mark_return_delegation(server, delegation);
+ else
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+out:
+ spin_unlock(&delegation->lock);
}
static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
@@ -1276,6 +1291,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server,
return;
clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
}
@@ -1354,6 +1370,9 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
nfs4_stateid stateid;
unsigned long gen = ++server->delegation_gen;
+ if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED,
+ &server->delegation_flags))
+ return 0;
restart:
rcu_read_lock();
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
@@ -1383,6 +1402,9 @@ restart:
goto restart;
}
nfs_inode_mark_test_expired_delegation(server,inode);
+ set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
+ set_bit(NFS4CLNT_DELEGATION_EXPIRED,
+ &server->nfs_client->cl_state);
iput(inode);
return -EAGAIN;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bc957487f6ec..bd23fc736b39 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -666,6 +666,8 @@ static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS)
+ return true;
if (ctx->pos == 0 ||
cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 98b45b636be3..61ad269c825f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1154,10 +1154,14 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
goto reset;
/* RPC connection errors */
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ return -NFS4ERR_FATAL_IOERROR;
+ fallthrough;
case -ECONNREFUSED:
case -EHOSTDOWN:
case -EHOSTUNREACH:
- case -ENETUNREACH:
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
@@ -1183,6 +1187,7 @@ reset:
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
u32 idx)
{
@@ -1200,6 +1205,11 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
case -EJUKEBOX:
nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
goto out_retry;
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ return -NFS4ERR_FATAL_IOERROR;
+ fallthrough;
default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
@@ -1234,7 +1244,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
- return ff_layout_async_handle_error_v3(task, lseg, idx);
+ return ff_layout_async_handle_error_v3(task, clp, lseg, idx);
case 4:
return ff_layout_async_handle_error_v4(task, state, clp,
lseg, idx);
@@ -1264,6 +1274,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -ECONNRESET:
case -EHOSTDOWN:
case -EHOSTUNREACH:
+ case -ENETDOWN:
case -ENETUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
@@ -1337,6 +1348,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return task->tk_status;
case -EAGAIN:
goto out_eagain;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
return 0;
@@ -1507,6 +1521,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
return task->tk_status;
case -EAGAIN:
return -EAGAIN;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
if (hdr->res.verf->committed == NFS_FILE_SYNC ||
@@ -1551,6 +1568,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
case -EAGAIN:
rpc_restart_call_prepare(task);
return -EAGAIN;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index b069385eea17..13f71ca8c974 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -50,6 +50,7 @@ enum nfs_param {
Opt_clientaddr,
Opt_cto,
Opt_alignwrite,
+ Opt_fatal_neterrors,
Opt_fg,
Opt_fscache,
Opt_fscache_flag,
@@ -72,6 +73,8 @@ enum nfs_param {
Opt_posix,
Opt_proto,
Opt_rdirplus,
+ Opt_rdirplus_none,
+ Opt_rdirplus_force,
Opt_rdma,
Opt_resvport,
Opt_retrans,
@@ -96,6 +99,20 @@ enum nfs_param {
};
enum {
+ Opt_fatal_neterrors_default,
+ Opt_fatal_neterrors_enetunreach,
+ Opt_fatal_neterrors_none,
+};
+
+static const struct constant_table nfs_param_enums_fatal_neterrors[] = {
+ { "default", Opt_fatal_neterrors_default },
+ { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach },
+ { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach },
+ { "none", Opt_fatal_neterrors_none },
+ {}
+};
+
+enum {
Opt_local_lock_all,
Opt_local_lock_flock,
Opt_local_lock_none,
@@ -151,6 +168,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_string("clientaddr", Opt_clientaddr),
fsparam_flag_no("cto", Opt_cto),
fsparam_flag_no("alignwrite", Opt_alignwrite),
+ fsparam_enum("fatal_neterrors", Opt_fatal_neterrors,
+ nfs_param_enums_fatal_neterrors),
fsparam_flag ("fg", Opt_fg),
fsparam_flag_no("fsc", Opt_fscache_flag),
fsparam_string("fsc", Opt_fscache),
@@ -174,7 +193,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_u32 ("port", Opt_port),
fsparam_flag_no("posix", Opt_posix),
fsparam_string("proto", Opt_proto),
- fsparam_flag_no("rdirplus", Opt_rdirplus),
+ fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus
+ fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=...
fsparam_flag ("rdma", Opt_rdma),
fsparam_flag_no("resvport", Opt_resvport),
fsparam_u32 ("retrans", Opt_retrans),
@@ -288,6 +308,12 @@ static const struct constant_table nfs_xprtsec_policies[] = {
{}
};
+static const struct constant_table nfs_rdirplus_tokens[] = {
+ { "none", Opt_rdirplus_none },
+ { "force", Opt_rdirplus_force },
+ {}
+};
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -636,10 +662,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->flags &= ~NFS_MOUNT_NOACL;
break;
case Opt_rdirplus:
- if (result.negated)
+ if (result.negated) {
+ ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
ctx->flags |= NFS_MOUNT_NORDIRPLUS;
- else
- ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ } else if (!param->string) {
+ ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS);
+ } else {
+ switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) {
+ case Opt_rdirplus_none:
+ ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
+ ctx->flags |= NFS_MOUNT_NORDIRPLUS;
+ break;
+ case Opt_rdirplus_force:
+ ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ }
break;
case Opt_sharecache:
if (result.negated)
@@ -872,6 +913,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_of_bounds;
ctx->nfs_server.max_connect = result.uint_32;
break;
+ case Opt_fatal_neterrors:
+ trace_nfs_mount_assign(param->key, param->string);
+ switch (result.uint_32) {
+ case Opt_fatal_neterrors_default:
+ if (fc->net_ns != &init_net)
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+ else
+ ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ case Opt_fatal_neterrors_enetunreach:
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ case Opt_fatal_neterrors_none:
+ ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
case Opt_lookupcache:
trace_nfs_mount_assign(param->key, param->string);
switch (result.uint_32) {
@@ -1651,6 +1711,9 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->xprtsec.cert_serial = TLS_NO_CERT;
ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY;
+ if (fc->net_ns != &init_net)
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+
fc->s_iflags |= SB_I_STABLE_WRITES;
}
fc->fs_private = ctx;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1aa67fca69b2..119e447758b9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1ac1d3eec517..ec8d32d0e2e9 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -912,6 +912,11 @@ static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
}
#endif
+static inline bool nfs_current_task_exiting(void)
+{
+ return (current->flags & PF_EXITING) != 0;
+}
+
static inline bool nfs_error_is_fatal(int err)
{
switch (err) {
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b0c8a39c2bbd..0d7310c1ee0c 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -120,6 +120,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 755ed3c37051..a4cb67573aa7 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
- } while (!fatal_signal_pending(current));
+ } while (!fatal_signal_pending(current) && !nfs_current_task_exiting());
return res;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 1924c4a2077b..5cf52ece96ac 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -21,6 +21,8 @@
#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
+static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid,
+ u64 *copied);
static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr)
{
@@ -173,6 +175,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return err;
}
+static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server,
+ struct nfs_server *src_server,
+ struct nfs4_copy_state *copy)
+{
+ spin_lock(&dst_server->nfs_client->cl_lock);
+ list_del_init(&copy->copies);
+ spin_unlock(&dst_server->nfs_client->cl_lock);
+ if (dst_server != src_server) {
+ spin_lock(&src_server->nfs_client->cl_lock);
+ list_del_init(&copy->src_copies);
+ spin_unlock(&src_server->nfs_client->cl_lock);
+ }
+}
+
static int handle_async_copy(struct nfs42_copy_res *res,
struct nfs_server *dst_server,
struct nfs_server *src_server,
@@ -182,9 +198,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
bool *restart)
{
struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
- int status = NFS4_OK;
struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+ struct nfs_client *clp = dst_server->nfs_client;
+ unsigned long timeout = 3 * HZ;
+ int status = NFS4_OK;
+ u64 copied;
copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
@@ -222,15 +241,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
spin_unlock(&src_server->nfs_client->cl_lock);
}
- status = wait_for_completion_interruptible(&copy->completion);
- spin_lock(&dst_server->nfs_client->cl_lock);
- list_del_init(&copy->copies);
- spin_unlock(&dst_server->nfs_client->cl_lock);
- if (dst_server != src_server) {
- spin_lock(&src_server->nfs_client->cl_lock);
- list_del_init(&copy->src_copies);
- spin_unlock(&src_server->nfs_client->cl_lock);
- }
+wait:
+ status = wait_for_completion_interruptible_timeout(&copy->completion,
+ timeout);
+ if (!status)
+ goto timeout;
+ nfs4_copy_dequeue_callback(dst_server, src_server, copy);
if (status == -ERESTARTSYS) {
goto out_cancel;
} else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
@@ -240,6 +256,7 @@ static int handle_async_copy(struct nfs42_copy_res *res,
}
out:
res->write_res.count = copy->count;
+ /* Copy out the updated write verifier provided by CB_OFFLOAD. */
memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
status = -copy->error;
@@ -251,6 +268,39 @@ out_cancel:
if (!nfs42_files_from_same_server(src, dst))
nfs42_do_offload_cancel_async(src, src_stateid);
goto out_free;
+timeout:
+ timeout <<= 1;
+ if (timeout > (clp->cl_lease_time >> 1))
+ timeout = clp->cl_lease_time >> 1;
+ status = nfs42_proc_offload_status(dst, &copy->stateid, &copied);
+ if (status == -EINPROGRESS)
+ goto wait;
+ nfs4_copy_dequeue_callback(dst_server, src_server, copy);
+ switch (status) {
+ case 0:
+ /* The server recognized the copy stateid, so it hasn't
+ * rebooted. Don't overwrite the verifier returned in the
+ * COPY result. */
+ res->write_res.count = copied;
+ goto out_free;
+ case -EREMOTEIO:
+ /* COPY operation failed on the server. */
+ status = -EOPNOTSUPP;
+ res->write_res.count = copied;
+ goto out_free;
+ case -EBADF:
+ /* Server did not recognize the copy stateid. It has
+ * probably restarted and lost the plot. */
+ res->write_res.count = 0;
+ status = -EOPNOTSUPP;
+ break;
+ case -EOPNOTSUPP:
+ /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when
+ * it has signed up for an async COPY, so server is not
+ * spec-compliant. */
+ res->write_res.count = 0;
+ }
+ goto out_free;
}
static int process_copy_commit(struct file *dst, loff_t pos_dst,
@@ -582,6 +632,108 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
return status;
}
+static int
+_nfs42_proc_offload_status(struct nfs_server *server, struct file *file,
+ struct nfs42_offload_data *data)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = ctx->cred,
+ };
+ int status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &data->args.osa_seq_args,
+ &data->res.osr_seq_res, 1);
+ trace_nfs4_offload_status(&data->args, status);
+ switch (status) {
+ case 0:
+ break;
+
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ /*
+ * Server does not recognize the COPY stateid. CB_OFFLOAD
+ * could have purged it, or server might have rebooted.
+ * Since COPY stateids don't have an associated inode,
+ * avoid triggering state recovery.
+ */
+ status = -EBADF;
+ break;
+ case -NFS4ERR_NOTSUPP:
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ server->caps &= ~NFS_CAP_OFFLOAD_STATUS;
+ status = -EOPNOTSUPP;
+ break;
+ }
+
+ return status;
+}
+
+/**
+ * nfs42_proc_offload_status - Poll completion status of an async copy operation
+ * @dst: handle of file being copied into
+ * @stateid: copy stateid (from async COPY result)
+ * @copied: OUT: number of bytes copied so far
+ *
+ * Return values:
+ * %0: Server returned an NFS4_OK completion status
+ * %-EINPROGRESS: Server returned no completion status
+ * %-EREMOTEIO: Server returned an error completion status
+ * %-EBADF: Server did not recognize the copy stateid
+ * %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS
+ * %-ERESTARTSYS: Wait interrupted by signal
+ *
+ * Other negative errnos indicate the client could not complete the
+ * request.
+ */
+static int
+nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied)
+{
+ struct inode *inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_exception exception = {
+ .inode = inode,
+ };
+ struct nfs42_offload_data *data;
+ int status;
+
+ if (!(server->caps & NFS_CAP_OFFLOAD_STATUS))
+ return -EOPNOTSUPP;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->seq_server = server;
+ data->args.osa_src_fh = NFS_FH(inode);
+ memcpy(&data->args.osa_stateid, stateid,
+ sizeof(data->args.osa_stateid));
+ exception.stateid = &data->args.osa_stateid;
+ do {
+ status = _nfs42_proc_offload_status(server, dst, data);
+ if (status == -EOPNOTSUPP)
+ goto out;
+ status = nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ if (status)
+ goto out;
+
+ *copied = data->res.osr_count;
+ if (!data->res.complete_count)
+ status = -EINPROGRESS;
+ else if (data->res.osr_complete != NFS_OK)
+ status = -EREMOTEIO;
+
+out:
+ kfree(data);
+ return status;
+}
+
static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
struct nfs42_copy_notify_args *args,
struct nfs42_copy_notify_res *res)
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 5072d7ea72e9..b1b663468249 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -35,6 +35,11 @@
#define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_STATEID_SIZE))
#define decode_offload_cancel_maxsz (op_decode_hdr_maxsz)
+#define encode_offload_status_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_offload_status_maxsz (op_decode_hdr_maxsz + \
+ 2 /* osr_count */ + \
+ 2 /* osr_complete */)
#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_STATEID_SIZE) + \
1 + /* nl4_type */ \
@@ -143,6 +148,14 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
+#define NFS4_enc_offload_status_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_offload_status_maxsz)
+#define NFS4_dec_offload_status_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_offload_status_maxsz)
#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -345,6 +358,14 @@ static void encode_offload_cancel(struct xdr_stream *xdr,
encode_nfs4_stateid(xdr, &args->osa_stateid);
}
+static void encode_offload_status(struct xdr_stream *xdr,
+ const struct nfs42_offload_status_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->osa_stateid);
+}
+
static void encode_copy_notify(struct xdr_stream *xdr,
const struct nfs42_copy_notify_args *args,
struct compound_hdr *hdr)
@@ -570,6 +591,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
}
/*
+ * Encode OFFLOAD_STATUS request
+ */
+static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_offload_status_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->osa_seq_args, &hdr);
+ encode_putfh(xdr, args->osa_src_fh, &hdr);
+ encode_offload_status(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode COPY_NOTIFY request
*/
static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req,
@@ -921,6 +961,26 @@ static int decode_offload_cancel(struct xdr_stream *xdr,
return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL);
}
+static int decode_offload_status(struct xdr_stream *xdr,
+ struct nfs42_offload_status_res *res)
+{
+ ssize_t result;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS);
+ if (status)
+ return status;
+ /* osr_count */
+ if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0)
+ return -EIO;
+ /* osr_complete<1> */
+ result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1);
+ if (result < 0)
+ return -EIO;
+ res->complete_count = result;
+ return 0;
+}
+
static int decode_copy_notify(struct xdr_stream *xdr,
struct nfs42_copy_notify_res *res)
{
@@ -1371,6 +1431,32 @@ out:
}
/*
+ * Decode OFFLOAD_STATUS response
+ */
+static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_offload_status_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->osr_seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_offload_status(xdr, res);
+
+out:
+ return status;
+}
+
+/*
* Decode COPY_NOTIFY response
*/
static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 83378f69b35e..162c85a83a14 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -233,6 +233,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
if (test_bit(NFS_CS_PNFS, &cl_init->init_flags))
__set_bit(NFS_CS_PNFS, &clp->cl_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags);
/*
* Set up the connection to the server before we add add to the
* global list.
@@ -937,6 +939,9 @@ static int nfs4_set_client(struct nfs_server *server,
__set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
server->port = rpc_get_port((struct sockaddr *)addr);
+ if (server->flags & NFS_MOUNT_NETUNREACH_FATAL)
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
+
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
if (IS_ERR(clp))
@@ -1011,6 +1016,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
__set_bit(NFS_CS_PNFS, &cl_init.init_flags);
cl_init.max_connect = NFS_MAX_TRANSPORTS;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 70c8ea943019..970f28dbf253 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -195,6 +195,9 @@ static int nfs4_map_errors(int err)
return -EBUSY;
case -NFS4ERR_NOT_SAME:
return -ENOTSYNC;
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ break;
default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
@@ -443,6 +446,8 @@ static int nfs4_delay_killable(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
@@ -454,6 +459,8 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
@@ -1774,7 +1781,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ !nfs_current_task_exiting()) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
else
@@ -3576,7 +3584,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
write_sequnlock(&state->seqlock);
trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current) || nfs_current_task_exiting())
status = -EINTR;
else
if (schedule_timeout(5*HZ) != 0)
@@ -9594,7 +9602,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
return;
trace_nfs4_sequence(clp, task->tk_status);
- if (task->tk_status < 0 && !task->tk_client->cl_shutdown) {
+ if (task->tk_status < 0 && clp->cl_cons_state >= 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
if (refcount_read(&clp->cl_count) == 1)
return;
@@ -10798,7 +10806,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_CLONE
| NFS_CAP_LAYOUTERROR
| NFS_CAP_READ_PLUS
- | NFS_CAP_MOVEABLE,
+ | NFS_CAP_MOVEABLE
+ | NFS_CAP_OFFLOAD_STATUS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 542cdf71229f..7612e977e80b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1198,7 +1198,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
struct rpc_clnt *clnt = clp->cl_rpcclient;
bool swapon = false;
- if (clnt->cl_shutdown)
+ if (clp->cl_cons_state < 0)
return;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
@@ -1403,7 +1403,7 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
clp->cl_hostname);
nfs4_schedule_state_manager(clp);
- return 0;
+ return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0;
}
EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
@@ -2739,7 +2739,15 @@ out_error:
pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
" with error %d\n", section_sep, section,
clp->cl_hostname, -status);
- ssleep(1);
+ switch (status) {
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ nfs_mark_client_ready(clp, -EIO);
+ break;
+ default:
+ ssleep(1);
+ break;
+ }
out_drain:
memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 22c973316f0b..bc67fe6801b1 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2608,7 +2608,7 @@ TRACE_EVENT(nfs4_copy_notify,
)
);
-TRACE_EVENT(nfs4_offload_cancel,
+DECLARE_EVENT_CLASS(nfs4_offload_class,
TP_PROTO(
const struct nfs42_offload_status_args *args,
int error
@@ -2640,6 +2640,15 @@ TRACE_EVENT(nfs4_offload_cancel,
__entry->stateid_seq, __entry->stateid_hash
)
);
+#define DEFINE_NFS4_OFFLOAD_EVENT(name) \
+ DEFINE_EVENT(nfs4_offload_class, name, \
+ TP_PROTO( \
+ const struct nfs42_offload_status_args *args, \
+ int error \
+ ), \
+ TP_ARGS(args, error))
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel);
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status);
DECLARE_EVENT_CLASS(nfs4_xattr_event,
TP_PROTO(
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e8ac3f615f93..55bef5fbfa47 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -82,9 +82,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
* we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
*/
#define pagepad_maxsz (1)
-#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2)
-#define lock_owner_id_maxsz (1 + 1 + 4)
-#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define open_owner_id_maxsz (2 + 1 + 2 + 2)
+#define lock_owner_id_maxsz (2 + 1 + 2)
#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
#define op_encode_hdr_maxsz (1)
@@ -185,7 +184,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
#define encode_open_maxsz (op_encode_hdr_maxsz + \
2 + encode_share_access_maxsz + 2 + \
- open_owner_id_maxsz + \
+ 1 + open_owner_id_maxsz + \
encode_opentype_maxsz + \
encode_claim_null_maxsz)
#define decode_space_limit_maxsz (3)
@@ -255,13 +254,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define encode_link_maxsz (op_encode_hdr_maxsz + \
nfs4_name_maxsz)
#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
-#define encode_lockowner_maxsz (7)
+#define encode_lockowner_maxsz (2 + 1 + lock_owner_id_maxsz)
+
#define encode_lock_maxsz (op_encode_hdr_maxsz + \
7 + \
1 + encode_stateid_maxsz + 1 + \
encode_lockowner_maxsz)
#define decode_lock_denied_maxsz \
- (8 + decode_lockowner_maxsz)
+ (2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz)
#define decode_lock_maxsz (op_decode_hdr_maxsz + \
decode_lock_denied_maxsz)
#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
@@ -617,7 +617,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
encode_lockowner_maxsz)
#define NFS4_dec_release_lockowner_sz \
(compound_decode_hdr_maxsz + \
- decode_lockowner_maxsz)
+ decode_release_lockowner_maxsz)
#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -1412,7 +1412,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
__be32 *p;
/*
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
- * owner 4 = 32
+ * owner 28
*/
encode_nfs4_seqid(xdr, arg->seqid);
encode_share_access(xdr, arg->share_access);
@@ -5077,7 +5077,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
/*
* We create the owner, so we know a proper owner.id length is 4.
*/
-static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl)
{
uint64_t offset, length, clientid;
__be32 *p;
@@ -7702,6 +7702,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(CLONE, enc_clone, dec_clone),
PROC42(COPY, enc_copy, dec_copy),
PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
+ PROC42(OFFLOAD_STATUS, enc_offload_status, dec_offload_status),
PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aeb715b4a690..9eea9e62afc9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -454,8 +454,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
{ NFS_MOUNT_NONLM, ",nolock", "" },
{ NFS_MOUNT_NOACL, ",noacl", "" },
{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+ { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" },
{ NFS_MOUNT_UNSHARED, ",nosharecache", "" },
{ NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+ { NFS_MOUNT_NETUNREACH_FATAL,
+ ",fatal_neterrors=ENETDOWN:ENETUNREACH",
+ ",fatal_neterrors=none" },
{ 0, NULL, NULL }
};
const struct proc_nfs_info *nfs_infop;
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 7b59a40d40c0..37cb2b776435 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -14,6 +14,7 @@
#include <linux/rcupdate.h>
#include <linux/lockd/lockd.h>
+#include "internal.h"
#include "nfs4_fs.h"
#include "netns.h"
#include "sysfs.h"
@@ -228,6 +229,25 @@ static void shutdown_client(struct rpc_clnt *clnt)
rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
}
+/*
+ * Shut down the nfs_client only once all the superblocks
+ * have been shut down.
+ */
+static void shutdown_nfs_client(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (!(server->flags & NFS_MOUNT_SHUTDOWN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ nfs_mark_client_ready(clp, -EIO);
+ shutdown_client(clp->cl_rpcclient);
+}
+
static ssize_t
shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
@@ -259,7 +279,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
server->flags |= NFS_MOUNT_SHUTDOWN;
shutdown_client(server->client);
- shutdown_client(server->nfs_client->cl_rpcclient);
if (!IS_ERR(server->client_acl))
shutdown_client(server->client_acl);
@@ -267,11 +286,44 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
if (server->nlm_host)
shutdown_client(server->nlm_host->h_rpcclnt);
out:
+ shutdown_nfs_client(server->nfs_client);
return count;
}
static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown);
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+static ssize_t
+implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid;
+
+ if (!impl_id || strlen(impl_id->domain) == 0)
+ return 0; //sysfs_emit(buf, "");
+ return sysfs_emit(buf, "%s\n", impl_id->domain);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain);
+
+
+static ssize_t
+implid_name_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid;
+
+ if (!impl_id || strlen(impl_id->name) == 0)
+ return 0; //sysfs_emit(buf, "");
+ return sysfs_emit(buf, "%s\n", impl_id->name);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name);
+
+#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */
+
#define RPC_CLIENT_NAME_SIZE 64
void nfs_sysfs_link_rpc_client(struct nfs_server *server,
@@ -309,6 +361,32 @@ static struct kobj_type nfs_sb_ktype = {
.child_ns_type = nfs_netns_object_child_ns_type,
};
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server)
+{
+ int ret;
+
+ if (!server->nfs_client->cl_implid)
+ return;
+
+ ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+
+ ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+}
+#else /* CONFIG_NFS_V4_1 */
+static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
void nfs_sysfs_add_server(struct nfs_server *server)
{
int ret;
@@ -325,6 +403,8 @@ void nfs_sysfs_add_server(struct nfs_server *server)
if (ret < 0)
pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
server->s_sysfs_id, ret);
+
+ nfs_sysfs_add_nfsv41_server(server);
}
EXPORT_SYMBOL_GPL(nfs_sysfs_add_server);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index aa3d8bea3ec0..23df8b214474 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -579,8 +579,10 @@ retry:
while (!nfs_lock_request(head)) {
ret = nfs_wait_on_request(head);
- if (ret < 0)
+ if (ret < 0) {
+ nfs_release_request(head);
return ERR_PTR(ret);
+ }
}
/* Ensure that nobody removed the request before we locked it */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3a202e51b360..83970d97840b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2424,7 +2424,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
* the area protected by sc_state_lock.
*/
if (thread_is_alive)
- del_timer_sync(&sci->sc_timer);
+ timer_delete_sync(&sci->sc_timer);
}
/**
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index af94e3737470..e946f75eb540 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -2664,8 +2664,9 @@ int attr_set_compress(struct ntfs_inode *ni, bool compr)
attr->nres.run_off = cpu_to_le16(run_off);
}
- /* Update data attribute flags. */
+ /* Update attribute flags. */
if (compr) {
+ attr->flags &= ~ATTR_FLAG_SPARSED;
attr->flags |= ATTR_FLAG_COMPRESSED;
attr->nres.c_unit = NTFS_LZNT_CUNIT;
} else {
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 3f96a11804c9..9b6a3f8d2e7c 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -101,8 +101,26 @@ int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
/* Allowed to change compression for empty files and for directories only. */
if (!is_dedup(ni) && !is_encrypted(ni) &&
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- /* Change compress state. */
- int err = ni_set_compress(inode, flags & FS_COMPR_FL);
+ int err = 0;
+ struct address_space *mapping = inode->i_mapping;
+
+ /* write out all data and wait. */
+ filemap_invalidate_lock(mapping);
+ err = filemap_write_and_wait(mapping);
+
+ if (err >= 0) {
+ /* Change compress state. */
+ bool compr = flags & FS_COMPR_FL;
+ err = ni_set_compress(inode, compr);
+
+ /* For files change a_ops too. */
+ if (!err)
+ mapping->a_ops = compr ? &ntfs_aops_cmpr :
+ &ntfs_aops;
+ }
+
+ filemap_invalidate_unlock(mapping);
+
if (err)
return err;
}
@@ -412,6 +430,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
}
if (extend_init && !is_compressed(ni)) {
+ WARN_ON(ni->i_valid >= pos);
err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos);
if (err)
goto out;
@@ -1228,21 +1247,22 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
int err;
- err = check_write_restriction(inode);
- if (err)
- return err;
-
- if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
- ntfs_inode_warn(inode, "direct i/o + compressed not supported");
- return -EOPNOTSUPP;
- }
-
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
inode_lock(inode);
}
+ ret = check_write_restriction(inode);
+ if (ret)
+ goto out;
+
+ if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+ ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 5df6a0b5add9..b7a83200f2cc 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -281,63 +281,6 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
}
/*
- * ni_load_attr - Load attribute that contains given VCN.
- */
-struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, CLST vcn,
- struct mft_inode **pmi)
-{
- struct ATTR_LIST_ENTRY *le;
- struct ATTRIB *attr;
- struct mft_inode *mi;
- struct ATTR_LIST_ENTRY *next;
-
- if (!ni->attr_list.size) {
- if (pmi)
- *pmi = &ni->mi;
- return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len,
- NULL);
- }
-
- le = al_find_ex(ni, NULL, type, name, name_len, NULL);
- if (!le)
- return NULL;
-
- /*
- * Unfortunately ATTR_LIST_ENTRY contains only start VCN.
- * So to find the ATTRIB segment that contains 'vcn' we should
- * enumerate some entries.
- */
- if (vcn) {
- for (;; le = next) {
- next = al_find_ex(ni, le, type, name, name_len, NULL);
- if (!next || le64_to_cpu(next->vcn) > vcn)
- break;
- }
- }
-
- if (ni_load_mi(ni, le, &mi))
- return NULL;
-
- if (pmi)
- *pmi = mi;
-
- attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id);
- if (!attr)
- return NULL;
-
- if (!attr->non_res)
- return attr;
-
- if (le64_to_cpu(attr->nres.svcn) <= vcn &&
- vcn <= le64_to_cpu(attr->nres.evcn))
- return attr;
-
- _ntfs_bad_inode(&ni->vfs_inode);
- return NULL;
-}
-
-/*
* ni_load_all_mi - Load all subrecords.
*/
int ni_load_all_mi(struct ntfs_inode *ni)
@@ -3434,10 +3377,12 @@ int ni_set_compress(struct inode *inode, bool compr)
}
ni->std_fa = std->fa;
- if (compr)
+ if (compr) {
+ std->fa &= ~FILE_ATTRIBUTE_SPARSE_FILE;
std->fa |= FILE_ATTRIBUTE_COMPRESSED;
- else
+ } else {
std->fa &= ~FILE_ATTRIBUTE_COMPRESSED;
+ }
if (ni->std_fa != std->fa) {
ni->std_fa = std->fa;
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 938d351ebac7..df81f1f7330c 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -1035,34 +1035,6 @@ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
return NULL;
}
-int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
-{
- struct block_device *bdev = sb->s_bdev;
- u32 blocksize = sb->s_blocksize;
- u64 block = lbo >> sb->s_blocksize_bits;
- u32 off = lbo & (blocksize - 1);
- u32 op = blocksize - off;
-
- for (; bytes; block += 1, off = 0, op = blocksize) {
- struct buffer_head *bh = __bread(bdev, block, blocksize);
-
- if (!bh)
- return -EIO;
-
- if (op > bytes)
- op = bytes;
-
- memcpy(buffer, bh->b_data + off, op);
-
- put_bh(bh);
-
- bytes -= op;
- buffer = Add2Ptr(buffer, op);
- }
-
- return 0;
-}
-
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buf, int wait)
{
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 7eb9fae22f8d..78d20e4baa2c 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -618,7 +618,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
u32 off = le32_to_cpu(hdr->de_off);
if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
- off + sizeof(struct NTFS_DE) > end) {
+ size_add(off, sizeof(struct NTFS_DE)) > end) {
/* incorrect index buffer. */
return false;
}
@@ -736,7 +736,7 @@ fill_table:
if (end > total)
return NULL;
- if (off + sizeof(struct NTFS_DE) > end)
+ if (size_add(off, sizeof(struct NTFS_DE)) > end)
return NULL;
e = Add2Ptr(hdr, off);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index a1e11228dafd..3e2957a1e360 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1025,46 +1025,6 @@ int ntfs_sync_inode(struct inode *inode)
}
/*
- * writeback_inode - Helper function for ntfs_flush_inodes().
- *
- * This writes both the inode and the file data blocks, waiting
- * for in flight data blocks before the start of the call. It
- * does not wait for any io started during the call.
- */
-static int writeback_inode(struct inode *inode)
-{
- int ret = sync_inode_metadata(inode, 0);
-
- if (!ret)
- ret = filemap_fdatawrite(inode->i_mapping);
- return ret;
-}
-
-/*
- * ntfs_flush_inodes
- *
- * Write data and metadata corresponding to i1 and i2. The io is
- * started but we do not wait for any of it to finish.
- *
- * filemap_flush() is used for the block device, so if there is a dirty
- * page for a block already in flight, we will not wait and start the
- * io over again.
- */
-int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
- struct inode *i2)
-{
- int ret = 0;
-
- if (i1)
- ret = writeback_inode(i1);
- if (!ret && i2)
- ret = writeback_inode(i2);
- if (!ret)
- ret = filemap_flush(sb->s_bdev_file->f_mapping);
- return ret;
-}
-
-/*
* Helper function to read file.
*/
int inode_read_data(struct inode *inode, void *data, size_t bytes)
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 241f2ffdd920..1ff13b6f9613 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -717,7 +717,7 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
struct NTFS_DE *e;
u16 esize;
- if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used )
+ if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used)
return NULL;
e = Add2Ptr(hdr, de_off);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 382820464dee..d628977e2556 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -530,9 +530,6 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTR_LIST_ENTRY **le,
struct mft_inode **mi);
-struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, CLST vcn,
- struct mft_inode **pmi);
int ni_load_all_mi(struct ntfs_inode *ni);
bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
@@ -619,7 +616,6 @@ enum NTFS_DIRTY_FLAGS {
NTFS_DIRTY_ERROR = 2,
};
int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty);
-int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buffer, int wait);
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
@@ -717,8 +713,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
u32 len, u32 copied, struct folio *folio, void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
-int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
- struct inode *i2);
int inode_read_data(struct inode *inode, void *data, size_t bytes);
int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const struct cpu_str *uni,
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 6a0f6b0a3ab2..920a1ab47b63 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -555,6 +555,55 @@ static const struct proc_ops ntfs3_label_fops = {
.proc_write = ntfs3_label_write,
};
+static void ntfs_create_procdir(struct super_block *sb)
+{
+ struct proc_dir_entry *e;
+
+ if (!proc_info_root)
+ return;
+
+ e = proc_mkdir(sb->s_id, proc_info_root);
+ if (e) {
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ proc_create_data("volinfo", 0444, e,
+ &ntfs3_volinfo_fops, sb);
+ proc_create_data("label", 0644, e,
+ &ntfs3_label_fops, sb);
+ sbi->procdir = e;
+ }
+}
+
+static void ntfs_remove_procdir(struct super_block *sb)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ if (!sbi->procdir)
+ return;
+
+ remove_proc_entry("label", sbi->procdir);
+ remove_proc_entry("volinfo", sbi->procdir);
+ remove_proc_entry(sb->s_id, proc_info_root);
+ sbi->procdir = NULL;
+}
+
+static void ntfs_create_proc_root(void)
+{
+ proc_info_root = proc_mkdir("fs/ntfs3", NULL);
+}
+
+static void ntfs_remove_proc_root(void)
+{
+ if (proc_info_root) {
+ remove_proc_entry("fs/ntfs3", NULL);
+ proc_info_root = NULL;
+ }
+}
+#else
+static void ntfs_create_procdir(struct super_block *sb) {}
+static void ntfs_remove_procdir(struct super_block *sb) {}
+static void ntfs_create_proc_root(void) {}
+static void ntfs_remove_proc_root(void) {}
#endif
static struct kmem_cache *ntfs_inode_cachep;
@@ -644,15 +693,7 @@ static void ntfs_put_super(struct super_block *sb)
{
struct ntfs_sb_info *sbi = sb->s_fs_info;
-#ifdef CONFIG_PROC_FS
- // Remove /proc/fs/ntfs3/..
- if (sbi->procdir) {
- remove_proc_entry("label", sbi->procdir);
- remove_proc_entry("volinfo", sbi->procdir);
- remove_proc_entry(sb->s_id, proc_info_root);
- sbi->procdir = NULL;
- }
-#endif
+ ntfs_remove_procdir(sb);
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
@@ -1590,20 +1631,7 @@ load_root:
kfree(boot2);
}
-#ifdef CONFIG_PROC_FS
- /* Create /proc/fs/ntfs3/.. */
- if (proc_info_root) {
- struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root);
- static_assert((S_IRUGO | S_IWUSR) == 0644);
- if (e) {
- proc_create_data("volinfo", S_IRUGO, e,
- &ntfs3_volinfo_fops, sb);
- proc_create_data("label", S_IRUGO | S_IWUSR, e,
- &ntfs3_label_fops, sb);
- sbi->procdir = e;
- }
- }
-#endif
+ ntfs_create_procdir(sb);
if (is_legacy_ntfs(sb))
sb->s_flags |= SB_RDONLY;
@@ -1853,14 +1881,11 @@ static int __init init_ntfs_fs(void)
if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
-#ifdef CONFIG_PROC_FS
- /* Create "/proc/fs/ntfs3" */
- proc_info_root = proc_mkdir("fs/ntfs3", NULL);
-#endif
+ ntfs_create_proc_root();
err = ntfs3_init_bitmap();
if (err)
- return err;
+ goto out2;
ntfs_inode_cachep = kmem_cache_create(
"ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
@@ -1880,6 +1905,8 @@ out:
kmem_cache_destroy(ntfs_inode_cachep);
out1:
ntfs3_exit_bitmap();
+out2:
+ ntfs_remove_proc_root();
return err;
}
@@ -1890,11 +1917,7 @@ static void __exit exit_ntfs_fs(void)
unregister_filesystem(&ntfs_fs_type);
unregister_as_ntfs_legacy();
ntfs3_exit_bitmap();
-
-#ifdef CONFIG_PROC_FS
- if (proc_info_root)
- remove_proc_entry("fs/ntfs3", NULL);
-#endif
+ ntfs_remove_proc_root();
}
MODULE_LICENSE("GPL");
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f46b22561d6..fce9beb214f0 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
/* we shouldn't flush as we're in the thread, the
* races with pending sc work structs are harmless */
- del_timer_sync(&sc->sc_idle_timeout);
+ timer_delete_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 557cf9d40177..f8b9c9c73997 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -563,7 +563,7 @@ void pstore_unregister(struct pstore_info *psi)
pstore_unregister_kmsg();
/* Stop timer and make sure all work has finished. */
- del_timer_sync(&pstore_timer);
+ timer_delete_sync(&pstore_timer);
flush_work(&pstore_work);
/* Remove all backend records from filesystem tree. */
diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h
index 651759192280..5e8d163cb5f8 100644
--- a/fs/smb/client/cifs_fs_sb.h
+++ b/fs/smb/client/cifs_fs_sb.h
@@ -49,6 +49,7 @@
struct cifs_sb_info {
struct rb_root tlink_tree;
+ struct list_head tcon_sb_link;
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 8dea0cf3a8de..ca435a3841b8 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid,
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern void cifs_setsize(struct inode *inode, loff_t offset);
-extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
struct smb3_fs_context;
extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
@@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 53
-#define CIFS_VERSION "2.53"
+#define SMB3_PRODUCT_BUILD 54
+#define CIFS_VERSION "2.54"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 6ae170a2a042..07c4688ec4c9 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -714,6 +714,8 @@ struct TCP_Server_Info {
spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
+ int rfc1001_sessinit; /* whether to estasblish netbios session */
+ bool with_rfc1001; /* if netbios session is used */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
@@ -1321,7 +1323,8 @@ struct cifs_tcon {
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fids *cfids;
- /* BB add field for back pointer to sb struct(s)? */
+ struct list_head cifs_sb_list;
+ spinlock_t sb_list_lock;
#ifdef CONFIG_CIFS_DFS_UPCALL
struct delayed_work dfs_cache_work;
struct list_head dfs_ses_list;
@@ -1718,6 +1721,7 @@ struct mid_q_entry {
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
+ int mid_rc; /* rc for MID_RC */
unsigned int mid_flags;
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
@@ -1880,6 +1884,7 @@ static inline bool is_replayable_error(int error)
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
+#define MID_RC 0x80 /* mid_rc contains custom rc */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 29dcb88392e5..60cb264a01e5 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1041,15 +1041,31 @@ static __u16 convert_disposition(int disposition)
static int
access_flags_to_smbopen_mode(const int access_flags)
{
- int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
-
- if (masked_flags == GENERIC_READ)
- return SMBOPEN_READ;
- else if (masked_flags == GENERIC_WRITE)
+ /*
+ * SYSTEM_SECURITY grants both read and write access to SACL, treat is as read/write.
+ * MAXIMUM_ALLOWED grants as many access as possible, so treat it as read/write too.
+ * SYNCHRONIZE as is does not grant any specific access, so do not check its mask.
+ * If only SYNCHRONIZE bit is specified then fallback to read access.
+ */
+ bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA |
+ FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE |
+ WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY |
+ MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL);
+ bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE |
+ FILE_READ_ATTRIBUTES | READ_CONTROL |
+ SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE | GENERIC_READ);
+ bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE);
+
+ if (with_write_flags && with_read_flags)
+ return SMBOPEN_READWRITE;
+ else if (with_write_flags)
return SMBOPEN_WRITE;
-
- /* just go for read/write */
- return SMBOPEN_READWRITE;
+ else if (with_execute_flags)
+ return SMBOPEN_EXECUTE;
+ else
+ return SMBOPEN_READ;
}
int
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d7bad2c3af37..f298e86a3c1f 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -371,7 +371,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
*
*/
static int __cifs_reconnect(struct TCP_Server_Info *server,
- bool mark_smb_session)
+ bool mark_smb_session, bool once)
{
int rc = 0;
@@ -399,6 +399,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (rc) {
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
+ /* If was asked to reconnect only once, do not try it more times */
+ if (once)
+ break;
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -564,19 +567,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
return rc;
}
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
if (!server->leaf_fullpath)
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
return reconnect_dfs_server(server);
}
#else
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
}
#endif
+int
+cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ return _cifs_reconnect(server, mark_smb_session, false);
+}
+
+static int
+cifs_reconnect_once(struct TCP_Server_Info *server)
+{
+ return _cifs_reconnect(server, true, true);
+}
+
static void
cifs_echo_request(struct work_struct *work)
{
@@ -803,26 +820,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
/* Regular SMB response */
return true;
case RFC1002_SESSION_KEEP_ALIVE:
+ /*
+ * RFC 1002 session keep alive can sent by the server only when
+ * we established a RFC 1002 session. But Samba servers send
+ * RFC 1002 session keep alive also over port 445 on which
+ * RFC 1002 session is not established.
+ */
cifs_dbg(FYI, "RFC 1002 session keep alive\n");
break;
case RFC1002_POSITIVE_SESSION_RESPONSE:
- cifs_dbg(FYI, "RFC 1002 positive session response\n");
+ /*
+ * RFC 1002 positive session response cannot be returned
+ * for SMB request. RFC 1002 session response is handled
+ * exclusively in ip_rfc1001_connect() function.
+ */
+ cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n");
+ cifs_reconnect(server, true);
break;
case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
- * SMB negprot response.
+ * SMB negprot response, when we have not established
+ * RFC 1002 session (which means ip_rfc1001_connect()
+ * was skipped). Note that same still happens with
+ * Windows Server 2022 when connecting via port 139.
+ * So for this case when mount option -o nonbsessinit
+ * was not specified, try to reconnect with establishing
+ * RFC 1002 session. If new socket establishment with
+ * RFC 1002 session was successful then return to the
+ * mid's caller -EAGAIN, so it can retry the request.
*/
- cifs_dbg(FYI, "RFC 1002 negative session response\n");
- /* give server a second to clean up */
- msleep(1000);
- /*
- * Always try 445 first on reconnect since we get NACK
- * on some if we ever connected to port 139 (the NACK
- * is since we do not begin with RFC1001 session
- * initialize frame).
- */
- cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+ if (!cifs_rdma_enabled(server) &&
+ server->tcpStatus == CifsInNegotiate &&
+ !server->with_rfc1001 &&
+ server->rfc1001_sessinit != 0) {
+ int rc, mid_rc;
+ struct mid_q_entry *mid, *nmid;
+ LIST_HEAD(dispose_list);
+
+ cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n");
+
+ /*
+ * Before reconnect, delete all pending mids for this
+ * server, so reconnect would not signal connection
+ * aborted error to mid's callbacks. Note that for this
+ * server there should be exactly one pending mid
+ * corresponding to SMB1/SMB2 Negotiate packet.
+ */
+ spin_lock(&server->mid_lock);
+ list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
+ kref_get(&mid->refcount);
+ list_move(&mid->qhead, &dispose_list);
+ mid->mid_flags |= MID_DELETED;
+ }
+ spin_unlock(&server->mid_lock);
+
+ /* Now try to reconnect once with NetBIOS session. */
+ server->with_rfc1001 = true;
+ rc = cifs_reconnect_once(server);
+
+ /*
+ * If reconnect was successful then indicate -EAGAIN
+ * to mid's caller. If reconnect failed with -EAGAIN
+ * then mask it as -EHOSTDOWN, so mid's caller would
+ * know that it failed.
+ */
+ if (rc == 0)
+ mid_rc = -EAGAIN;
+ else if (rc == -EAGAIN)
+ mid_rc = -EHOSTDOWN;
+ else
+ mid_rc = rc;
+
+ /*
+ * After reconnect (either successful or unsuccessful)
+ * deliver reconnect status to mid's caller via mid's
+ * callback. Use MID_RC state which indicates that the
+ * return code should be read from mid_rc member.
+ */
+ list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) {
+ list_del_init(&mid->qhead);
+ mid->mid_rc = mid_rc;
+ mid->mid_state = MID_RC;
+ mid->callback(mid);
+ release_mid(mid);
+ }
+
+ /*
+ * If reconnect failed then wait two seconds. In most
+ * cases we were been called from the mount context and
+ * delivered failure to mid's callback will stop this
+ * receiver task thread and fails the mount process.
+ * So wait two seconds to prevent another reconnect
+ * in this task thread, which would be useless as the
+ * mount context will fail at all.
+ */
+ if (rc != 0)
+ msleep(2000);
+ } else {
+ cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n");
+ cifs_reconnect(server, true);
+ }
+ break;
+ case RFC1002_RETARGET_SESSION_RESPONSE:
+ cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n");
cifs_reconnect(server, true);
break;
default:
@@ -1701,6 +1802,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+ tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit;
+ tcp_ses->with_rfc1001 = false;
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
@@ -1731,12 +1834,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
*/
tcp_ses->tcpStatus = CifsNew;
++tcp_ses->srv_count;
+ tcp_ses->echo_interval = ctx->echo_interval * HZ;
- if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
- ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX)
- tcp_ses->echo_interval = ctx->echo_interval * HZ;
- else
- tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
if (tcp_ses->rdma) {
#ifndef CONFIG_CIFS_SMB_DIRECT
cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
@@ -3221,6 +3320,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
return -EIO;
}
+ server->with_rfc1001 = true;
return 0;
}
@@ -3332,7 +3432,16 @@ generic_ip_connect(struct TCP_Server_Info *server)
return rc;
}
trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
- if (sport == htons(RFC1001_PORT))
+
+ /*
+ * Establish RFC1001 NetBIOS session when it was explicitly requested
+ * by mount option -o nbsessinit, or when connecting to default RFC1001
+ * server port (139) and it was not explicitly disabled by mount option
+ * -o nonbsessinit.
+ */
+ if (server->with_rfc1001 ||
+ server->rfc1001_sessinit == 1 ||
+ (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT)))
rc = ip_rfc1001_connect(server);
return rc;
@@ -3481,6 +3590,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
struct smb3_fs_context *ctx = cifs_sb->ctx;
INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+ INIT_LIST_HEAD(&cifs_sb->tcon_sb_link);
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
@@ -3713,6 +3823,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
+ spin_lock(&tcon->sb_list_lock);
+ list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list);
+ spin_unlock(&tcon->sb_list_lock);
+
queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
return 0;
@@ -4054,9 +4168,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
struct rb_root *root = &cifs_sb->tlink_tree;
struct rb_node *node;
struct tcon_link *tlink;
+ struct cifs_tcon *tcon = NULL;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+ if (cifs_sb->master_tlink) {
+ tcon = cifs_sb->master_tlink->tl_tcon;
+ if (tcon) {
+ spin_lock(&tcon->sb_list_lock);
+ list_del_init(&cifs_sb->tcon_sb_link);
+ spin_unlock(&tcon->sb_list_lock);
+ }
+ }
+
spin_lock(&cifs_sb->tlink_tree_lock);
while ((node = rb_first(root))) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
@@ -4078,11 +4202,13 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
+ bool in_retry = false;
int rc = 0;
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
+retry:
/* only send once per connect */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsGood &&
@@ -4102,6 +4228,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
+ if (rc == -EAGAIN) {
+ /* Allow one retry attempt */
+ if (!in_retry) {
+ in_retry = true;
+ goto retry;
+ }
+ rc = -EHOSTDOWN;
+ }
if (rc == 0) {
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index bdb762d398af..2980941b9667 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -135,6 +135,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("witness", Opt_witness),
fsparam_flag_no("nativesocket", Opt_nativesocket),
fsparam_flag_no("unicode", Opt_unicode),
+ fsparam_flag_no("nbsessinit", Opt_nbsessinit),
/* Mount options which take uid or gid */
fsparam_uid("backupuid", Opt_backupuid),
@@ -968,6 +969,10 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change unicode during remount\n");
return -EINVAL;
}
+ if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) {
+ cifs_errorf(fc, "can not change nbsessinit during remount\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -1333,6 +1338,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_rsize:
ctx->rsize = result.uint_32;
ctx->got_rsize = true;
+ ctx->vol_rsize = ctx->rsize;
break;
case Opt_wsize:
ctx->wsize = result.uint_32;
@@ -1348,6 +1354,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->wsize, PAGE_SIZE);
}
}
+ ctx->vol_wsize = ctx->wsize;
break;
case Opt_acregmax:
if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
@@ -1383,6 +1390,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->closetimeo = HZ * result.uint_32;
break;
case Opt_echo_interval:
+ if (result.uint_32 < SMB_ECHO_INTERVAL_MIN ||
+ result.uint_32 > SMB_ECHO_INTERVAL_MAX) {
+ cifs_errorf(fc, "echo interval is out of bounds\n");
+ goto cifs_parse_mount_err;
+ }
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
@@ -1602,6 +1614,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (i == RFC1001_NAME_LEN && param->string[i] != 0)
pr_warn("server netbiosname longer than 15 truncated\n");
break;
+ case Opt_nbsessinit:
+ ctx->rfc1001_sessinit = !result.negated;
+ cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit);
+ break;
case Opt_ver:
/* version of mount userspace tools, not dialect */
/* If interface changes in mount.cifs bump to new ver */
@@ -1889,13 +1905,16 @@ int smb3_init_fs_context(struct fs_context *fc)
memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
ctx->source_rfc1001_name[i] = toupper(nodename[i]);
-
ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
+
/*
* null target name indicates to use *SMBSERVR default called name
* if we end up sending RFC1001 session initialize
*/
ctx->target_rfc1001_name[0] = 0;
+
+ ctx->rfc1001_sessinit = -1; /* autodetect based on port number */
+
ctx->cred_uid = current_uid();
ctx->linux_uid = current_uid();
ctx->linux_gid = current_gid();
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 42c6b66c2c1a..d1d29249bcdb 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -174,6 +174,7 @@ enum cifs_param {
Opt_iocharset,
Opt_netbiosname,
Opt_servern,
+ Opt_nbsessinit,
Opt_ver,
Opt_vers,
Opt_sec,
@@ -216,6 +217,7 @@ struct smb3_fs_context {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+ int rfc1001_sessinit;
kuid_t cred_uid;
kuid_t linux_uid;
kgid_t linux_gid;
@@ -280,6 +282,9 @@ struct smb3_fs_context {
bool use_client_guid:1;
/* reuse existing guid for multichannel */
u8 client_guid[SMB2_CLIENT_GUID_SIZE];
+ /* User-specified original r/wsize value */
+ unsigned int vol_rsize;
+ unsigned int vol_wsize;
unsigned int bsize;
unsigned int rasize;
unsigned int rsize;
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 3bb21aa58474..a00a9d91d0da 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2901,23 +2901,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
return -EOPNOTSUPP;
}
-int cifs_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE - 1);
- struct page *page;
- int rc = 0;
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
-
- zero_user_segment(page, offset, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- return rc;
-}
-
void cifs_setsize(struct inode *inode, loff_t offset)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
@@ -3012,8 +2995,6 @@ set_size_out:
*/
attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
-
- cifs_truncate_page(inode->i_mapping, inode->i_size);
}
return rc;
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index a88253668286..769752ad2c5c 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
+ struct cifs_open_info_data query_data;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data);
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
rc = -ENOENT;
/* it's not a symlink */
goto out;
@@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index b328dc5c7988..7b6ed9b23e71 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
+ INIT_LIST_HEAD(&ret_buf->cifs_sb_list);
spin_lock_init(&ret_buf->open_file_lock);
spin_lock_init(&ret_buf->stat_lock);
+ spin_lock_init(&ret_buf->sb_list_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 8701484805cd..26df807fbe7a 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -14,6 +14,8 @@
#include "cifspdu.h"
#include "cifs_unicode.h"
#include "fs_context.h"
+#include "nterr.h"
+#include "smberr.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -426,13 +428,6 @@ cifs_negotiate(const unsigned int xid,
{
int rc;
rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- set_credits(server, 1);
- rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
return rc;
}
@@ -444,8 +439,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- if (ctx->wsize)
- wsize = ctx->wsize;
+ if (ctx->got_wsize)
+ wsize = ctx->vol_wsize;
else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
wsize = CIFS_DEFAULT_IOSIZE;
else
@@ -497,7 +492,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
else
defsize = server->maxBuf - sizeof(READ_RSP);
- rsize = ctx->rsize ? ctx->rsize : defsize;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : defsize;
/*
* no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
@@ -1069,6 +1064,47 @@ cifs_make_node(unsigned int xid, struct inode *inode,
full_path, mode, dev);
}
+static bool
+cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
+{
+ struct smb_hdr *shdr = (struct smb_hdr *)buf;
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ if (shdr->Flags2 & SMBFLG2_ERR_STATUS) {
+ if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED))
+ return false;
+ } else {
+ if (shdr->Status.DosError.ErrorClass != ERRSRV ||
+ shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid))
+ return false;
+ }
+
+ /* If server is a channel, select the primary channel */
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->tid == shdr->Tid) {
+ spin_lock(&tcon->tc_lock);
+ tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ pr_warn_once("Server share %s deleted.\n",
+ tcon->tree_name);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1153,6 +1189,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
.make_node = cifs_make_node,
+ .is_network_name_deleted = cifs_is_network_name_deleted,
};
struct smb_version_values smb1_values = {
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index d609a20fb98a..a7f629238830 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -152,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
+ bool retry_without_read_attributes = false;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL)
return -ENOMEM;
- oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ /*
+ * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED
+ * contains also FILE_READ_ATTRIBUTES access right. So do not append
+ * FILE_READ_ATTRIBUTES when not needed and prevent calling code path
+ * for retry_without_read_attributes.
+ */
+ if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) &&
+ !(oparms->desired_access & GENERIC_READ) &&
+ !(oparms->desired_access & GENERIC_EXECUTE) &&
+ !(oparms->desired_access & GENERIC_ALL) &&
+ !(oparms->desired_access & MAXIMUM_ALLOWED)) {
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ retry_without_read_attributes = true;
+ }
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
&err_buftype);
+ if (rc == -EACCES && retry_without_read_attributes) {
+ oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ }
if (rc && data) {
struct smb2_hdr *hdr = err_iov.iov_base;
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 2466e6155136..224495322a05 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -38,6 +38,7 @@ enum smb2_compound_ops {
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
SMB2_OP_QUERY_WSL_EA,
+ SMB2_OP_OPEN_QUERY,
};
/* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index e9fd3e204a6f..57d9bfbadd97 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
{
+ struct smb2_create_rsp *create_rsp = NULL;
struct smb2_query_info_rsp *qi_rsp = NULL;
struct smb2_compound_vars *vars = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -265,7 +266,13 @@ replay_again:
num_rqst++;
rc = 0;
- for (i = 0; i < num_cmds; i++) {
+ i = 0;
+
+ /* Skip the leading explicit OPEN operation */
+ if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY)
+ i++;
+
+ for (; i < num_cmds; i++) {
/* Operation */
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
@@ -640,6 +647,27 @@ finished:
}
tmp_rc = rc;
+
+ if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) {
+ create_rsp = rsp_iov[0].iov_base;
+ idata = in_iov[0].iov_base;
+ idata->fi.CreationTime = create_rsp->CreationTime;
+ idata->fi.LastAccessTime = create_rsp->LastAccessTime;
+ idata->fi.LastWriteTime = create_rsp->LastWriteTime;
+ idata->fi.ChangeTime = create_rsp->ChangeTime;
+ idata->fi.Attributes = create_rsp->FileAttributes;
+ idata->fi.AllocationSize = create_rsp->AllocationSize;
+ idata->fi.EndOfFile = create_rsp->EndofFile;
+ if (le32_to_cpu(idata->fi.NumberOfLinks) == 0)
+ idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */
+ idata->fi.DeletePending = 0;
+ idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY);
+
+ /* smb2_parse_contexts() fills idata->fi.IndexNumber */
+ rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,
+ oparms->fid->lease_key, &oplock, &idata->fi, NULL);
+ }
+
for (i = 0; i < num_cmds; i++) {
char *buf = rsp_iov[i + i].iov_base;
@@ -978,6 +1006,43 @@ int smb2_query_path_info(const unsigned int xid,
case 0:
rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
break;
+ case -EACCES:
+ /*
+ * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with
+ * STATUS_ACCESS_DENIED then it means that caller does not have permission to
+ * open the path with FILE_READ_ATTRIBUTES access and therefore cannot issue
+ * SMB2_OP_QUERY_INFO command.
+ *
+ * There is an alternative way how to query limited information about path but still
+ * suitable for stat() syscall. SMB2 OPEN/CREATE operation returns in its successful
+ * response subset of query information.
+ *
+ * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED
+ * access which will grant the maximum possible access to the file and the response
+ * will contain required query information for stat() syscall.
+ */
+
+ if (tcon->posix_extensions)
+ break;
+
+ num_cmds = 1;
+ cmds[0] = SMB2_OP_OPEN_QUERY;
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED,
+ FILE_OPEN, create_options, ACL_NO_MODE);
+ free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov));
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, out_iov, out_buftype, NULL);
+
+ hdr = out_iov[0].iov_base;
+ if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ if (!rc)
+ rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
+ break;
case -EOPNOTSUPP:
/*
* BB TODO: When support for special files added to Samba
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index a700e5921961..41d8cd20b25f 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid,
server->CurrentMid = 0;
spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
- /* BB we probably don't need to retry with modern servers */
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
return rc;
}
+static inline unsigned int
+prevent_zero_iosize(unsigned int size, const char *type)
+{
+ if (size == 0) {
+ cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n",
+ type, CIFS_MIN_DEFAULT_IOSIZE);
+ return CIFS_MIN_DEFAULT_IOSIZE;
+ }
+ return size;
+}
+
static unsigned int
smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
@@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
static unsigned int
@@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
/*
@@ -3526,8 +3534,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc == 0) {
netfs_resize_file(&cifsi->netfs, new_eof, true);
cifs_setsize(inode, new_eof);
- cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, new_eof);
}
goto out;
}
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4f69a1825e42..81e05db8e4d5 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -43,6 +43,7 @@
#endif
#include "cached_dir.h"
#include "compress.h"
+#include "fs_context.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -4089,6 +4090,24 @@ smb2_echo_callback(struct mid_q_entry *mid)
add_credits(server, &credits, CIFS_ECHO_OP);
}
+static void cifs_renegotiate_iosize(struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_sb_info *cifs_sb;
+
+ if (server == NULL || tcon == NULL)
+ return;
+
+ spin_lock(&tcon->sb_list_lock);
+ list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) {
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
+ cifs_sb->ctx->wsize =
+ server->ops->negotiate_wsize(tcon, cifs_sb->ctx);
+ }
+ spin_unlock(&tcon->sb_list_lock);
+}
+
void smb2_reconnect_server(struct work_struct *work)
{
struct TCP_Server_Info *server = container_of(work,
@@ -4174,9 +4193,10 @@ void smb2_reconnect_server(struct work_struct *work)
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
- if (!rc)
+ if (!rc) {
+ cifs_renegotiate_iosize(server, tcon);
cifs_reopen_persistent_handles(tcon);
- else
+ } else
resched = true;
list_del_init(&tcon->rlist);
if (tcon->ipc)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 03434dbe9374..266af17aa7d9 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -894,6 +894,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
case MID_SHUTDOWN:
rc = -EHOSTDOWN;
break;
+ case MID_RC:
+ rc = mid->mid_rc;
+ break;
default:
if (!(mid->mid_flags & MID_DELETED)) {
list_del_init(&mid->qhead);
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 7d49f38f01f3..b88fa04f5792 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -31,6 +31,8 @@
* secure, replaced by SMB2 (then even more highly secure SMB3) many years ago
*/
#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */
+#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */
+#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */
#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */
#define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */
#define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */
@@ -38,6 +40,7 @@
/* BB need to add server (Samba e.g) support for security and trusted prefix */
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
+ XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER,
XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL };
static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
@@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
struct smb_ntsd *pacl;
@@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP);
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ aclflags = CIFS_ACL_SACL;
+ break;
case XATTR_CIFS_ACL:
default:
aclflags = CIFS_ACL_DACL;
@@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
/*
@@ -327,6 +341,12 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
case XATTR_CIFS_NTSD:
extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ extra_info = SACL_SECINFO;
+ break;
case XATTR_CIFS_ACL:
default:
extra_info = DACL_SECINFO;
@@ -448,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = {
.set = cifs_xattr_set,
};
+static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_SACL,
+ .flags = XATTR_CIFS_NTSD_SACL,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler smb3_ntsd_owner_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_OWNER,
+ .flags = XATTR_CIFS_NTSD_OWNER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = {
.name = CIFS_XATTR_CIFS_NTSD,
.flags = XATTR_CIFS_NTSD,
@@ -493,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+ &smb3_ntsd_sacl_xattr_handler,
+ &smb3_ntsd_owner_xattr_handler,
&cifs_cifs_ntsd_xattr_handler,
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c7a0efda4403..764dca80c15c 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -95,6 +95,9 @@
*/
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+/* According to MS-SMB2 specification The minimum recommended value is 65536.*/
+#define CIFS_MIN_DEFAULT_IOSIZE (65536)
+
/*
* SMB2 Header Definition
*
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 00b31cf86462..83caa3849749 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -1016,9 +1016,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
ses_enc_key = enc ? sess->smb3encryptionkey :
sess->smb3decryptionkey;
- if (enc)
- ksmbd_user_session_get(sess);
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ if (!enc)
+ ksmbd_user_session_put(sess);
return 0;
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 91c2318639e7..14620e147dda 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -27,6 +27,7 @@ enum {
KSMBD_SESS_EXITING,
KSMBD_SESS_NEED_RECONNECT,
KSMBD_SESS_NEED_NEGOTIATE,
+ KSMBD_SESS_NEED_SETUP,
KSMBD_SESS_RELEASING
};
@@ -187,6 +188,11 @@ static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn)
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE;
}
+static inline bool ksmbd_conn_need_setup(struct ksmbd_conn *conn)
+{
+ return READ_ONCE(conn->status) == KSMBD_SESS_NEED_SETUP;
+}
+
static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn)
{
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT;
@@ -217,6 +223,11 @@ static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn)
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE);
}
+static inline void ksmbd_conn_set_need_setup(struct ksmbd_conn *conn)
+{
+ WRITE_ONCE(conn->status, KSMBD_SESS_NEED_SETUP);
+}
+
static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn)
{
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 53d308f331af..3f45f28f6f0f 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -181,7 +181,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
down_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
- if (atomic_read(&sess->refcnt) == 0 &&
+ if (atomic_read(&sess->refcnt) <= 1 &&
(sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
sess->last_active + SMB2_SESSION_TIMEOUT))) {
@@ -233,7 +233,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
down_write(&conn->session_lock);
xa_erase(&conn->sessions, sess->id);
up_write(&conn->session_lock);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
}
@@ -252,7 +253,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (xa_empty(&sess->ksmbd_chann_list)) {
xa_erase(&conn->sessions, sess->id);
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
up_write(&conn->session_lock);
@@ -328,8 +330,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess)
if (atomic_read(&sess->refcnt) <= 0)
WARN_ON(1);
- else
- atomic_dec(&sess->refcnt);
+ else if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
@@ -372,13 +374,13 @@ void destroy_previous_session(struct ksmbd_conn *conn,
ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT);
err = ksmbd_conn_wait_idle_sess_id(conn, id);
if (err) {
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
goto out;
}
ksmbd_destroy_file_table(&prev_sess->file_table);
prev_sess->state = SMB2_SESSION_EXPIRED;
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
ksmbd_launch_ksmbd_durable_scavenger();
out:
up_write(&conn->session_lock);
@@ -436,7 +438,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
rwlock_init(&sess->tree_conns_lock);
- atomic_set(&sess->refcnt, 1);
+ atomic_set(&sess->refcnt, 2);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 4ddf4300371b..d24d95d15d87 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1249,7 +1249,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
}
conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
err_out:
ksmbd_conn_unlock(conn);
@@ -1271,6 +1271,9 @@ static int alloc_preauth_hash(struct ksmbd_session *sess,
if (sess->Preauth_HashValue)
return 0;
+ if (!conn->preauth_info)
+ return -ENOMEM;
+
sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue,
PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP);
if (!sess->Preauth_HashValue)
@@ -1674,6 +1677,11 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_debug(SMB, "Received smb2 session setup request\n");
+ if (!ksmbd_conn_need_setup(conn) && !ksmbd_conn_good(conn)) {
+ work->send_no_response = 1;
+ return rc;
+ }
+
WORK_BUFFERS(work, req, rsp);
rsp->StructureSize = cpu_to_le16(9);
@@ -1909,7 +1917,7 @@ out_err:
if (try_delay) {
ksmbd_conn_set_need_reconnect(conn);
ssleep(5);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
}
}
smb2_set_err_rsp(work);
@@ -2235,14 +2243,15 @@ int smb2_session_logoff(struct ksmbd_work *work)
return -ENOENT;
}
- ksmbd_destroy_file_table(&sess->file_table);
down_write(&conn->session_lock);
sess->state = SMB2_SESSION_EXPIRED;
up_write(&conn->session_lock);
- ksmbd_free_user(sess->user);
- sess->user = NULL;
- ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
+ if (sess->user) {
+ ksmbd_free_user(sess->user);
+ sess->user = NULL;
+ }
+ ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP);
rsp->StructureSize = cpu_to_le16(4);
err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 49b128698670..5aa7a66334d9 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap,
return -EIO;
}
+ if (psid->num_subauth == 0) {
+ pr_err("%s: zero subauthorities!\n", __func__);
+ return -EIO;
+ }
+
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
@@ -1026,7 +1031,9 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct dentry *parent = path->dentry->d_parent;
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size;
- int rc = 0, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size;
+ int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size;
+ unsigned int dacloffset;
+ size_t dacl_struct_end;
u16 num_aces, ace_cnt = 0;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
@@ -1035,8 +1042,11 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
parent, &parent_pntsd);
if (pntsd_size <= 0)
return -ENOENT;
+
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
- if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) {
+ if (!dacloffset ||
+ check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size) {
rc = -EINVAL;
goto free_parent_pntsd;
}
@@ -1240,7 +1250,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
struct smb_ntsd *pntsd = NULL;
struct smb_acl *pdacl;
struct posix_acl *posix_acls;
- int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset;
+ int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size;
+ unsigned int dacl_offset;
+ size_t dacl_struct_end;
struct smb_sid sid;
int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
struct smb_ace *ace;
@@ -1259,7 +1271,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
dacl_offset = le32_to_cpu(pntsd->dacloffset);
if (!dacl_offset ||
- (dacl_offset + sizeof(struct smb_acl) > pntsd_size))
+ check_add_overflow(dacl_offset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size)
goto err_out;
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 97c4d71115d8..d80f94346199 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);