summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode_dotl.c2
-rw-r--r--fs/afs/dynroot.c4
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/bcachefs/Kconfig8
-rw-r--r--fs/bcachefs/acl.c4
-rw-r--r--fs/bcachefs/alloc_background.c95
-rw-r--r--fs/bcachefs/alloc_background.h6
-rw-r--r--fs/bcachefs/alloc_foreground.c79
-rw-r--r--fs/bcachefs/backpointers.c24
-rw-r--r--fs/bcachefs/bcachefs.h13
-rw-r--r--fs/bcachefs/btree_gc.c4
-rw-r--r--fs/bcachefs/btree_io.c17
-rw-r--r--fs/bcachefs/btree_iter.c188
-rw-r--r--fs/bcachefs/btree_iter.h122
-rw-r--r--fs/bcachefs/btree_journal_iter.c5
-rw-r--r--fs/bcachefs/btree_key_cache.c32
-rw-r--r--fs/bcachefs/btree_node_scan.c14
-rw-r--r--fs/bcachefs/btree_types.h1
-rw-r--r--fs/bcachefs/btree_update.c26
-rw-r--r--fs/bcachefs/btree_update_interior.c14
-rw-r--r--fs/bcachefs/btree_write_buffer.c18
-rw-r--r--fs/bcachefs/buckets.c19
-rw-r--r--fs/bcachefs/buckets.h26
-rw-r--r--fs/bcachefs/buckets_types.h5
-rw-r--r--fs/bcachefs/chardev.c14
-rw-r--r--fs/bcachefs/checksum.c247
-rw-r--r--fs/bcachefs/checksum.h3
-rw-r--r--fs/bcachefs/clock.c2
-rw-r--r--fs/bcachefs/compress.c5
-rw-r--r--fs/bcachefs/data_update.c10
-rw-r--r--fs/bcachefs/debug.c4
-rw-r--r--fs/bcachefs/dirent.c20
-rw-r--r--fs/bcachefs/disk_accounting.c4
-rw-r--r--fs/bcachefs/disk_groups.c4
-rw-r--r--fs/bcachefs/ec.c18
-rw-r--r--fs/bcachefs/errcode.h2
-rw-r--r--fs/bcachefs/error.c7
-rw-r--r--fs/bcachefs/extent_update.c6
-rw-r--r--fs/bcachefs/extents.c2
-rw-r--r--fs/bcachefs/fs-io-buffered.c23
-rw-r--r--fs/bcachefs/fs-io.c14
-rw-r--r--fs/bcachefs/fs-ioctl.c2
-rw-r--r--fs/bcachefs/fs.c24
-rw-r--r--fs/bcachefs/fsck.c74
-rw-r--r--fs/bcachefs/inode.c18
-rw-r--r--fs/bcachefs/io_misc.c18
-rw-r--r--fs/bcachefs/io_read.c41
-rw-r--r--fs/bcachefs/io_write.c40
-rw-r--r--fs/bcachefs/journal.c14
-rw-r--r--fs/bcachefs/journal_io.c10
-rw-r--r--fs/bcachefs/migrate.c4
-rw-r--r--fs/bcachefs/move.c14
-rw-r--r--fs/bcachefs/movinggc.c8
-rw-r--r--fs/bcachefs/namei.c38
-rw-r--r--fs/bcachefs/quota.c2
-rw-r--r--fs/bcachefs/rebalance.c12
-rw-r--r--fs/bcachefs/recovery.c17
-rw-r--r--fs/bcachefs/reflink.c23
-rw-r--r--fs/bcachefs/sb-errors_format.h4
-rw-r--r--fs/bcachefs/sb-members.h23
-rw-r--r--fs/bcachefs/snapshot.c13
-rw-r--r--fs/bcachefs/str_hash.c2
-rw-r--r--fs/bcachefs/str_hash.h8
-rw-r--r--fs/bcachefs/subvolume.c4
-rw-r--r--fs/bcachefs/subvolume.h14
-rw-r--r--fs/bcachefs/super-io.c41
-rw-r--r--fs/bcachefs/super.c104
-rw-r--r--fs/bcachefs/tests.c30
-rw-r--r--fs/bcachefs/xattr.c2
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/zstd.c2
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/devpts/inode.c4
-rw-r--r--fs/erofs/Kconfig2
-rw-r--r--fs/erofs/erofs_fs.h8
-rw-r--r--fs/erofs/fileio.c2
-rw-r--r--fs/erofs/zdata.c1
-rw-r--r--fs/erofs/zmap.c5
-rw-r--r--fs/exec.c3
-rw-r--r--fs/ext4/block_validity.c5
-rw-r--r--fs/ext4/inode.c75
-rw-r--r--fs/ext4/mballoc.c18
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/fuse/virtio_fs.c3
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/hfs/bnode.c6
-rw-r--r--fs/hfsplus/bnode.c6
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/namei.c8
-rw-r--r--fs/namespace.c37
-rw-r--r--fs/netfs/main.c4
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/super.c5
-rw-r--r--fs/pstore/platform.c2
-rw-r--r--fs/smb/client/cifs_fs_sb.h1
-rw-r--r--fs/smb/client/cifsencrypt.c16
-rw-r--r--fs/smb/client/cifsfs.h5
-rw-r--r--fs/smb/client/cifsglob.h13
-rw-r--r--fs/smb/client/cifspdu.h2
-rw-r--r--fs/smb/client/cifssmb.c32
-rw-r--r--fs/smb/client/connect.c182
-rw-r--r--fs/smb/client/fs_context.c21
-rw-r--r--fs/smb/client/fs_context.h5
-rw-r--r--fs/smb/client/inode.c44
-rw-r--r--fs/smb/client/link.c8
-rw-r--r--fs/smb/client/misc.c2
-rw-r--r--fs/smb/client/reparse.c63
-rw-r--r--fs/smb/client/reparse.h5
-rw-r--r--fs/smb/client/sess.c60
-rw-r--r--fs/smb/client/smb1ops.c110
-rw-r--r--fs/smb/client/smb2file.c21
-rw-r--r--fs/smb/client/smb2glob.h1
-rw-r--r--fs/smb/client/smb2inode.c67
-rw-r--r--fs/smb/client/smb2ops.c46
-rw-r--r--fs/smb/client/smb2pdu.c35
-rw-r--r--fs/smb/client/transport.c3
-rw-r--r--fs/smb/client/xattr.c36
-rw-r--r--fs/smb/common/smb2pdu.h9
-rw-r--r--fs/smb/server/auth.c4
-rw-r--r--fs/smb/server/connection.h11
-rw-r--r--fs/smb/server/mgmt/user_session.c18
-rw-r--r--fs/smb/server/smb2pdu.c21
-rw-r--r--fs/smb/server/smb_common.h2
-rw-r--r--fs/smb/server/smbacl.c21
-rw-r--r--fs/userfaultfd.c51
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_dquot.c3
-rw-r--r--fs/xfs/xfs_fsmap.c51
-rw-r--r--fs/xfs/xfs_inode_item.c6
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_sysfs.c32
-rw-r--r--fs/xfs/xfs_trans_ail.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h28
-rw-r--r--fs/xfs/xfs_zone_alloc.c7
-rw-r--r--fs/xfs/xfs_zone_gc.c16
148 files changed, 1820 insertions, 1198 deletions
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index cc2007be2173..5b5fda617b80 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -407,8 +407,8 @@ static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
err);
goto error;
}
- v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
err = 0;
inc_nlink(dir);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 691e0ae607a1..8c6130789fde 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -348,9 +348,9 @@ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
}
if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
- rcu_read_lock();
+ down_read(&net->cells_lock);
ret = afs_dynroot_readdir_cells(net, ctx);
- rcu_read_unlock();
+ up_read(&net->cells_lock);
}
return ret;
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 07a8bfbdd9b9..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -534,6 +534,6 @@ dont_wait:
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
- if (del_timer_sync(&net->fs_probe_timer))
+ if (timer_delete_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c530d1ca15df..8755f2703815 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -318,7 +318,7 @@ struct afs_server *afs_use_server(struct afs_server *server, bool activate,
a = atomic_inc_return(&server->active);
if (a == 1 && activate &&
!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
- del_timer(&server->timer);
+ timer_delete(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index c9798750202d..07709b0d7688 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -4,7 +4,7 @@ config BCACHEFS_FS
depends on BLOCK
select EXPORTFS
select CLOSURES
- select LIBCRC32C
+ select CRC32
select CRC64
select FS_POSIX_ACL
select LZ4_COMPRESS
@@ -15,10 +15,9 @@ config BCACHEFS_FS
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
- select CRYPTO
select CRYPTO_LIB_SHA256
- select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_CHACHA
+ select CRYPTO_LIB_POLY1305
select KEYS
select RAID6_PQ
select XOR_BLOCKS
@@ -26,6 +25,7 @@ config BCACHEFS_FS
select SRCU
select SYMBOLIC_ERRNAME
select MIN_HEAP
+ select XARRAY_MULTI
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 99487727ae64..d03adc36100e 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct posix_acl *acl = NULL;
if (rcu)
@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
umode_t mode;
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index c12ca7538e4f..94ea9e49aec4 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -610,7 +610,7 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -631,17 +631,17 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
continue;
}
if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -1039,9 +1039,10 @@ invalid_bucket:
* This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, struct bkey *hole)
{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -1052,9 +1053,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
struct btree_iter iter2;
struct bpos next;
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
- struct btree_path *path = btree_iter_path(iter->trans, iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
@@ -1064,9 +1065,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
- k = bch2_btree_iter_peek_max(&iter2, end);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
next = iter2.pos;
- bch2_trans_iter_exit(iter->trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter2);
BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
@@ -1107,13 +1108,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
- struct bch_fs *c = iter->trans->c;
+ struct bch_fs *c = trans->c;
struct bkey_s_c k;
again:
- k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
if (bkey_err(k))
return k;
@@ -1126,7 +1128,7 @@ again:
if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, hole_start);
+ bch2_btree_iter_set_pos(trans, iter, hole_start);
goto again;
}
@@ -1167,8 +1169,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(discard_iter);
+ bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(trans, discard_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1181,8 +1183,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1195,8 +1197,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1249,9 +1251,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (!ca->mi.freespace_initialized)
return 0;
- bch2_btree_iter_set_pos(freespace_iter, start);
+ bch2_btree_iter_set_pos(trans, freespace_iter, start);
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1300,9 +1302,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1435,7 +1437,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
*gen = a->gen;
out:
fsck_err:
- bch2_set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(trans, &alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
@@ -1572,7 +1574,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
+ k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1610,7 +1612,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, next);
+ bch2_btree_iter_set_pos(trans, &iter, next);
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -1638,7 +1640,7 @@ bkey_err:
BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -1657,7 +1659,7 @@ bkey_err:
break;
}
- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -1685,7 +1687,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret;
- alloc_k = bch2_btree_iter_peek(alloc_iter);
+ alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
if (!alloc_k.k)
return 0;
@@ -1826,7 +1828,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
@@ -1950,7 +1952,7 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
@@ -1967,7 +1969,7 @@ void bch2_dev_do_discards(struct bch_dev *ca)
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_write_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
@@ -2045,7 +2047,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2065,7 +2067,7 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2082,6 +2084,9 @@ static int invalidate_one_bp(struct btree_trans *trans,
if (ret)
return ret;
+ if (!extent_k.k)
+ return 0;
+
struct bkey_i *n =
bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
BTREE_UPDATE_internal_snapshot_node);
@@ -2199,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
struct bkey_s_c k;
again:
- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
*wrapped = true;
goto again;
}
@@ -2251,12 +2256,12 @@ restart_err:
if (ret)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
bch2_bkey_buf_exit(&last_flushed, c);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2274,7 +2279,7 @@ void bch2_dev_do_invalidates(struct bch_dev *ca)
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
put_ref:
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
@@ -2321,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
break;
}
- k = bch2_get_key_or_hole(&iter, end, &hole);
+ k = bch2_get_key_or_hole(trans, &iter, end, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -2340,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
} else {
struct bkey_i *freespace;
@@ -2360,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, k.k->p);
+ bch2_btree_iter_set_pos(trans, &iter, k.k->p);
}
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2506,7 +2511,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- for_each_rw_member(c, ca) {
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
u64 dev_reserve = 0;
/*
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c556ccaffe89..34b3d6ac4fbb 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
{
u64 want_free = ca->mi.nbuckets >> 7;
u64 free = max_t(s64, 0,
- u.d[BCH_DATA_free].buckets
- + u.d[BCH_DATA_need_discard].buckets
+ u.buckets[BCH_DATA_free]
+ + u.buckets[BCH_DATA_need_discard]
- bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+ return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
}
void bch2_dev_do_invalidates(struct bch_dev *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index da0d72928b5b..7c930ef77380 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -327,7 +327,7 @@ again:
bucket = sector_to_bucket(ca,
round_up(bucket_to_sector(ca, bucket) + 1,
1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
s->buckets_seen++;
s->skipped_mi_btree_bitmap++;
continue;
@@ -355,7 +355,7 @@ again:
watermark, s, cl)
: NULL;
next:
- bch2_set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(trans, &citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -417,7 +417,7 @@ again:
1ULL << ca->mi.btree_bitmap_shift));
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
s->skipped_mi_btree_bitmap++;
goto next;
}
@@ -426,7 +426,7 @@ again:
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
break;
}
@@ -469,7 +469,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
+ prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]);
prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
bch2_copygc_wait_amount(c),
@@ -524,10 +524,10 @@ again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
- if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ if (usage->buckets[BCH_DATA_need_discard] > avail)
bch2_dev_do_discards(ca);
- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ if (usage->buckets[BCH_DATA_need_gc_gens] > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
@@ -606,8 +606,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
- (stripe->next_alloc[l] < stripe->next_alloc[r]));
+ return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
}
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
@@ -626,25 +625,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
return ret;
}
+static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
+static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
+static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
+
+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
+{
+ /*
+ * Avoid underflowing clock hands if at all possible, if clock hands go
+ * to 0 then we lose information - clock hands can be in a wide range if
+ * we have devices we rarely try to allocate from, if we generally
+ * allocate from a specified target but only sometimes have to fall back
+ * to the whole filesystem.
+ */
+ u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
+ u64 scale_min = 0; /* minumum we must subtract to avoid overflow */
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
+ if (*v)
+ scale_max = min(scale_max, *v);
+ if (*v > stripe_clock_hand_max)
+ scale_min = max(scale_min, *v - stripe_clock_hand_max);
+ }
+
+ u64 scale = max(scale_min, scale_max);
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
+}
+
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
struct dev_stripe_state *stripe,
struct bch_dev_usage *usage)
{
+ /*
+ * Stripe state has a per device clock hand: we allocate from the device
+ * with the smallest clock hand.
+ *
+ * When we allocate, we don't do a simple increment; we add the inverse
+ * of the device's free space. This results in round robin behavior that
+ * biases in favor of the device(s) with more free space.
+ */
+
u64 *v = stripe->next_alloc + ca->dev_idx;
u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
u64 free_space_inv = free_space
- ? div64_u64(1ULL << 48, free_space)
- : 1ULL << 48;
- u64 scale = *v / 4;
+ ? div64_u64(stripe_clock_hand_inv, free_space)
+ : stripe_clock_hand_inv;
- if (*v + free_space_inv >= *v)
- *v += free_space_inv;
- else
- *v = U64_MAX;
+ /* Saturating add, avoid overflow: */
+ u64 sum = *v + free_space_inv;
+ *v = sum >= *v ? sum : U64_MAX;
- for (v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
+ if (unlikely(*v > stripe_clock_hand_rescale))
+ bch2_stripe_state_rescale(stripe);
}
void bch2_dev_stripe_increment(struct bch_dev *ca,
@@ -1633,7 +1669,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
unsigned nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
@@ -1656,7 +1692,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+ prt_printf(out, "buckets to invalidate\t%llu\r\n",
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
}
static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 21d1d86d5008..ff26bb515150 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -252,12 +252,24 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
0,
bp.v->level,
iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
}
+ /*
+ * peek_slot() doesn't normally return NULL - except when we ask for a
+ * key at a btree level that doesn't exist.
+ *
+ * We may want to revisit this and change peek_slot():
+ */
+ if (!k.k) {
+ bkey_init(&iter->k);
+ iter->k.p = bp.v->pos;
+ k.k = &iter->k;
+ }
+
if (k.k &&
extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
return k;
@@ -293,7 +305,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
0,
bp.v->level - 1,
0);
- struct btree *b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, iter);
if (IS_ERR_OR_NULL(b))
goto err;
@@ -321,7 +333,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st
return 0;
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
+ struct btree_iter alloc_iter = {};
struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -462,7 +474,7 @@ err:
if (bio)
bio_put(bio);
kvfree(data_buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
printbuf_exit(&buf);
return ret;
}
@@ -650,7 +662,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
retry:
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(&iter);
+ b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -934,7 +946,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f52311017aee..75f7408da173 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -524,8 +524,8 @@ struct bch_dev {
struct percpu_ref ref;
#endif
struct completion ref_completion;
- struct percpu_ref io_ref;
- struct completion io_ref_completion;
+ struct percpu_ref io_ref[2];
+ struct completion io_ref_completion[2];
struct bch_fs *fs;
@@ -562,7 +562,8 @@ struct bch_dev {
unsigned long *bucket_backpointer_mismatches;
unsigned long *bucket_backpointer_empty;
- struct bch_dev_usage __percpu *usage;
+ struct bch_dev_usage_full __percpu
+ *usage;
/* Allocator: */
u64 alloc_cursor[3];
@@ -787,6 +788,8 @@ struct bch_fs {
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
+ DARRAY(enum bcachefs_metadata_version)
+ incompat_versions_requested;
#ifdef CONFIG_UNICODE
struct unicode_map *cf_encoding;
@@ -980,8 +983,8 @@ struct bch_fs {
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
- struct crypto_sync_skcipher *chacha20;
- struct crypto_shash *poly1305;
+ struct bch_key chacha20_key;
+ bool chacha20_key_set;
atomic64_t key_version;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 2025d408979c..7b98ba2dec64 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -691,7 +691,7 @@ retry_root:
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err_root;
@@ -1199,7 +1199,7 @@ int bch2_gc_gens(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc, ({
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1d94a2bf706d..5fd4a58d2ad2 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1353,7 +1353,7 @@ start:
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick, false);
@@ -1609,6 +1609,7 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
+ percpu_ref_put(&ca->io_ref[READ]);
}
ra->err[rb->idx] = bio->bi_status;
@@ -1908,7 +1909,8 @@ static void btree_node_scrub_work(struct work_struct *work)
scrub->key.k->k.p, 0, scrub->level - 1, 0);
struct btree *b;
- int ret = lockrestart_do(trans, PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(&iter)));
+ int ret = lockrestart_do(trans,
+ PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
if (ret)
goto err;
@@ -1927,7 +1929,7 @@ err:
printbuf_exit(&err);
bch2_bkey_buf_exit(&scrub->key, c);;
btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- percpu_ref_put(&scrub->ca->io_ref);
+ percpu_ref_put(&scrub->ca->io_ref[READ]);
kfree(scrub);
bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
}
@@ -1996,7 +1998,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
return 0;
err_free:
btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
err:
bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
return ret;
@@ -2144,6 +2146,7 @@ static void btree_node_write_endio(struct bio *bio)
if (ca && bio->bi_status) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, "btree write error: %s\n ",
bch2_blk_status_to_str(bio->bi_status));
bch2_btree_pos_to_text(&buf, c, b);
@@ -2158,8 +2161,12 @@ static void btree_node_write_endio(struct bio *bio)
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
+ /*
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
+ * btree writes yet (due to device removal/ro):
+ */
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
if (parent) {
bio_put(bio);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index a9c110b846b5..e34e9598ef25 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -244,10 +244,8 @@ void bch2_trans_verify_paths(struct btree_trans *trans)
bch2_btree_path_verify(trans, path);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -276,9 +274,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_gt(iter->pos, iter->k.p)));
}
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+static int bch2_btree_iter_verify_ret(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_trans *trans = iter->trans;
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
@@ -299,7 +297,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
BTREE_ITER_nopreserve|
BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(&copy);
+ prev = bch2_btree_iter_prev(trans, &copy);
if (!prev.k)
goto out;
@@ -365,9 +363,11 @@ static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
+ struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k) { return 0; }
#endif
@@ -1855,10 +1855,8 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
if (!iter->path || trans->restarted)
return;
@@ -1870,17 +1868,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
/* Btree iterators: */
int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ return bch2_btree_path_traverse(trans, iter->path, iter->flags);
}
int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
- int ret;
-
bch2_trans_verify_not_unlocked_or_in_restart(trans);
iter->path = bch2_btree_path_set_pos(trans, iter->path,
@@ -1888,7 +1883,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
return ret;
@@ -1900,14 +1895,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
/* Iterate across nodes (leaf and interior nodes) */
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1929,7 +1924,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -1938,26 +1933,26 @@ err:
}
/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter)
{
struct btree *b;
- while (b = bch2_btree_iter_peek_node(iter),
+ while (b = bch2_btree_iter_peek_node(trans, iter),
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return b;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -2024,7 +2019,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -2034,7 +2029,7 @@ err:
/* Iterate across keys (in leaf nodes only) */
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2043,11 +2038,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2056,7 +2051,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
@@ -2183,9 +2178,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
* bkey_s_c_null:
*/
static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey u;
struct bkey_s_c k;
@@ -2231,14 +2226,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
return k;
}
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
int ret;
EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2248,7 +2243,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2258,7 +2253,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2269,10 +2264,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
@@ -2305,27 +2300,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
}
/**
* bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
struct bpos iter_pos = iter->pos;
@@ -2348,7 +2344,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
}
while (1) {
- k = __bch2_btree_iter_peek(iter, search_key);
+ k = __bch2_btree_iter_peek(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2462,9 +2458,9 @@ out_no_locked:
if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
- ret = bch2_btree_iter_verify_ret(iter, k);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
}
@@ -2472,7 +2468,7 @@ out_no_locked:
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2480,24 +2476,25 @@ end:
/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek(iter);
+ return bch2_btree_iter_peek(trans, iter);
}
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2507,7 +2504,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2517,7 +2514,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2533,10 +2530,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
@@ -2557,25 +2554,27 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
+ bch2_btree_iter_set_pos(trans, iter, POS_MIN);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
}
/**
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys greater than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
!bkey_eq(iter->pos, POS_MAX)) {
@@ -2587,7 +2586,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
* real visible extents - easiest to just use peek_slot() (which
* internally uses peek() for extents)
*/
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -2597,7 +2596,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
return k;
}
- struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct bkey_s_c k;
btree_path_idx_t saved_path = 0;
@@ -2613,7 +2611,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
}
while (1) {
- k = __bch2_btree_iter_peek_prev(iter, search_key);
+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2704,10 +2702,10 @@ out_no_locked:
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2715,27 +2713,27 @@ end:
/**
* bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_prev(iter);
+ return bch2_btree_iter_peek_prev(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key;
struct bkey_s_c k;
int ret;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
@@ -2751,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
}
search_key = btree_iter_search_key(iter);
@@ -2785,7 +2783,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out;
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
@@ -2812,8 +2810,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2824,9 +2822,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_max(iter, end);
+ k = bch2_btree_iter_peek_max(trans, iter, end);
if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
else
iter->pos = pos;
}
@@ -2857,39 +2855,39 @@ out:
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_verify_ret(iter, k);
+ bch2_btree_iter_verify(trans, iter);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret))
return bkey_s_c_err(ret);
return k;
}
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
{
struct bkey_s_c k;
- while (btree_trans_too_many_iters(iter->trans) ||
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return k;
}
@@ -3035,7 +3033,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
- iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -3075,10 +3072,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
BUG_ON(iter->min_depth != depth);
}
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_copy_iter(struct btree_trans *trans,
+ struct btree_iter *dst, struct btree_iter *src)
{
- struct btree_trans *trans = src->trans;
-
*dst = *src;
#ifdef TRACK_PATH_ALLOCATED
dst->ip_allocated = _RET_IP_;
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index e6f51a3b8187..9d2cccf5d21a 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -393,36 +393,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct
void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
+int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+ return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
}
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
+ return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
}
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_advance(struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_iter *);
+bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
@@ -433,10 +434,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
iter->k.size = 0;
}
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos new_pos)
{
- struct btree_trans *trans = iter->trans;
-
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@@ -454,13 +454,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
iter->pos = bkey_start_pos(&iter->k);
}
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, u32 snapshot)
{
struct bpos pos = iter->pos;
iter->snapshot = snapshot;
pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
}
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
@@ -502,7 +503,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- iter->trans = trans;
iter->update_path = 0;
iter->key_cache_path = 0;
iter->btree_id = btree_id;
@@ -539,9 +539,9 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
+void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -588,7 +588,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
struct bkey_s_c k;
bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
if (!bkey_err(k) && type && k.k->type != type)
k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
@@ -658,14 +658,14 @@ u32 bch2_trans_begin(struct btree_trans *);
int _ret3 = 0; \
do { \
_ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
if (!_b) \
break; \
\
PTR_ERR_OR_ZERO(_b) ?: (_do); \
})) ?: \
lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
} while (!_ret3); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
@@ -677,31 +677,34 @@ u32 bch2_trans_begin(struct btree_trans *);
__for_each_btree_node(_trans, _iter, _btree_id, _start, \
0, 0, _flags, _b, _do)
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek_prev(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek_prev(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
{
if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(iter, end);
+ return bch2_btree_iter_peek_max(trans, iter, end);
if (bkey_gt(iter->pos, end))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
int __bch2_btree_trans_too_many_iters(struct btree_trans *);
@@ -768,14 +771,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
_end, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -813,14 +816,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
(_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -850,37 +853,38 @@ transaction_restart: \
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
+ struct btree_iter *);
#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
-#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
+#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
for (; \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(&(_iter)))
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(_trans, &(_iter)))
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
+ for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index d1ad1a7613c9..7d6c971db23c 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -644,8 +644,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
*/
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
- cond_resched();
-
const struct journal_key *l = _l;
const struct journal_key *r = _r;
@@ -689,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+ sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL);
cond_resched();
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index edce59433375..2b186584a291 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -287,6 +287,19 @@ err:
return ret;
}
+static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+}
+
static noinline int btree_key_cache_fill(struct btree_trans *trans,
struct btree_path *ck_path,
unsigned flags)
@@ -306,7 +319,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -320,18 +333,11 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
if (ret)
goto err;
- if (trace_key_cache_fill_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
- }
+ if (trace_key_cache_fill_enabled())
+ do_trace_key_cache_fill(trans, ck_path, k);
out:
/* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -412,7 +418,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_intent);
b_iter.flags &= ~BTREE_ITER_with_key_cache;
- ret = bch2_btree_iter_traverse(&c_iter);
+ ret = bch2_btree_iter_traverse(trans, &c_iter);
if (ret)
goto out;
@@ -444,7 +450,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
ret = bkey_err(btree_k);
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 25d54b77cdc2..86acf037590c 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -183,7 +183,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
return;
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20)
+ if (!c->chacha20_key_set)
return;
struct nonce nonce = btree_nonce(&bn->keys, 0);
@@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
closure_put(w->cl);
kfree(w);
return 0;
@@ -291,7 +291,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
if (!w) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
ret = -ENOMEM;
goto err;
}
@@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f)
struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
kfree(w);
bch_err_msg(c, ret, "starting kthread");
break;
}
closure_get(&cl);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
wake_up_process(t);
}
err:
@@ -398,7 +398,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
bch2_print_string_as_lines(KERN_INFO, buf.buf);
}
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
dst = 0;
darray_for_each(f->nodes, i) {
@@ -418,7 +418,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
}
f->nodes.nr = dst;
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
if (0 && c->opts.verbose) {
printbuf_reset(&buf);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 77578da2d23f..023c472dc9ee 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -367,7 +367,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
- struct btree_trans *trans;
btree_path_idx_t path;
btree_path_idx_t update_path;
btree_path_idx_t key_cache_path;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c05394f56424..1e6b7836cc01 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -126,7 +126,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
struct bpos new_pos)
{
struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
+ struct btree_iter old_iter, new_iter = {};
struct bkey_s_c old_k, new_k;
snapshot_id_list s;
struct bkey_i *update;
@@ -140,7 +140,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
bch2_trans_iter_init(trans, &old_iter, id, old_pos,
BTREE_ITER_not_extents|
BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+ while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
struct bpos whiteout_pos =
@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_intent|
BTREE_ITER_with_updates|
BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -322,8 +322,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
if (done)
goto out;
next:
- bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ bch2_btree_iter_advance(trans, &iter);
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -592,13 +592,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
int ret = bkey_err(k);
if (ret)
goto err;
- bch2_btree_iter_advance(iter);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_advance(trans, iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -634,7 +634,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
BTREE_ITER_cached|
BTREE_ITER_not_extents|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -646,7 +646,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -695,7 +695,7 @@ int bch2_btree_delete(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, btree, pos,
BTREE_ITER_cached|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -713,7 +713,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
+ while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@@ -808,7 +808,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_bit_mod_iter(trans, &iter, set);
bch2_trans_iter_exit(trans, &iter);
return ret;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index bf7e1dac7f46..44b5fe430370 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1221,7 +1221,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- c->opts.metadata_replicas,
+ READ_ONCE(c->opts.metadata_replicas),
disk_res_flags);
if (ret)
goto err;
@@ -2147,7 +2147,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
BTREE_MAX_DEPTH, b->c.level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(iter);
+ int ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
goto err;
@@ -2239,7 +2239,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter,
btree, k->k.p,
BTREE_MAX_DEPTH, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
@@ -2262,7 +2262,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
/* Traverse one depth lower to get a pointer to the node itself: */
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -2406,7 +2406,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bool skip_triggers)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter2 = { NULL };
+ struct btree_iter iter2 = {};
struct btree *parent;
int ret;
@@ -2430,7 +2430,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
iter2.flags & BTREE_ITER_intent,
@@ -2444,7 +2444,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
- ret = bch2_btree_iter_traverse(&iter2) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 2c09d19dd621..0941fb2c026d 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -144,7 +144,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -208,7 +208,7 @@ btree_write_buffered_insert(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
@@ -285,7 +285,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
@@ -368,7 +368,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
write_locked = false;
ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_foreground_maybe_merge(trans, iter.path, 0,
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_journal_reclaim|
@@ -385,7 +385,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
- bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;
bool accounting_accumulated = false;
@@ -428,10 +428,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
- sort(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
+ sort_nonatomic(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 0903311cc71e..4ef261e8db4f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -30,8 +30,15 @@
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
+}
+
+void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
+{
memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
+ sizeof(struct bch_dev_usage_full) / sizeof(u64));
}
static u64 reserve_factor(u64 r)
@@ -75,7 +82,7 @@ bch2_fs_usage_read_short(struct bch_fs *c)
void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
- struct bch_dev_usage *usage)
+ struct bch_dev_usage_full *usage)
{
if (out->nr_tabstops < 5) {
printbuf_tabstops_reset(out);
@@ -365,7 +372,7 @@ found:
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, new,
BTREE_UPDATE_internal_snapshot_node|
BTREE_TRIGGER_norun);
@@ -707,7 +714,7 @@ err:
struct disk_accounting_pos acc;
memset(&acc, 0, sizeof(acc));
acc.type = BCH_DISK_ACCOUNTING_replicas;
- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
+ unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
gc_stripe_unlock(m);
acc.replicas.data_type = data_type;
@@ -1132,7 +1139,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
for_each_online_member(c, ca) {
int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ret;
}
}
@@ -1331,7 +1338,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- ca->usage = alloc_percpu(struct bch_dev_usage);
+ ca->usage = alloc_percpu(struct bch_dev_usage_full);
if (!ca->usage)
return -BCH_ERR_ENOMEM_usage_init;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index c5363256e363..8d75b27a1418 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -172,7 +172,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
+void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
+static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
+{
+ struct bch_dev_usage_full ret;
+
+ bch2_dev_usage_full_read_fast(ca, &ret);
+ return ret;
+}
+
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -207,7 +216,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets -
+ usage.buckets[BCH_DATA_free]-
ca->nr_open_buckets -
bch2_dev_buckets_reserved(ca, watermark));
}
@@ -217,10 +226,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets
- + usage.d[BCH_DATA_cached].buckets
- + usage.d[BCH_DATA_need_gc_gens].buckets
- + usage.d[BCH_DATA_need_discard].buckets
+ usage.buckets[BCH_DATA_free]
+ + usage.buckets[BCH_DATA_cached]
+ + usage.buckets[BCH_DATA_need_gc_gens]
+ + usage.buckets[BCH_DATA_need_discard]
- ca->nr_open_buckets
- bch2_dev_buckets_reserved(ca, watermark));
}
@@ -233,11 +242,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
-static inline unsigned dev_usage_u64s(void)
-{
- return sizeof(struct bch_dev_usage) / sizeof(u64);
-}
-
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 900b8680c8b5..0aed2500ade3 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -54,7 +54,12 @@ struct bucket_gens {
u8 b[] __counted_by(nbuckets);
};
+/* Only info on bucket countns: */
struct bch_dev_usage {
+ u64 buckets[BCH_DATA_NR];
+};
+
+struct bch_dev_usage_full {
struct bch_dev_usage_type {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 584f4a3eb670..5891b3a1e61c 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -350,8 +350,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (ctx->arg.op == BCH_DATA_OP_scrub) {
struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
if (ca) {
- struct bch_dev_usage u;
- bch2_dev_usage_read_fast(ca, &u);
+ struct bch_dev_usage_full u;
+ bch2_dev_usage_full_read_fast(ca, &u);
for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
if (ctx->arg.scrub.data_types & BIT(i))
e.p.sectors_total += u.d[i].sectors;
@@ -473,7 +473,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
unsigned i;
@@ -493,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -514,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
struct bch_ioctl_dev_usage_v2 __user *user_arg)
{
struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
int ret = 0;
@@ -534,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -615,7 +615,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
for_each_online_member(c, ca)
if (ca->dev == dev) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ca->dev_idx;
}
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3726689093e3..d0a34a097b80 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -7,17 +7,12 @@
#include "super-io.h"
#include <linux/crc32c.h>
-#include <linux/crypto.h>
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
#include <linux/ratelimit.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
#include <crypto/chacha.h>
-#include <crypto/hash.h>
#include <crypto/poly1305.h>
-#include <crypto/skcipher.h>
#include <keys/user-type.h>
/*
@@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS],
+ const struct bch_key *key, struct nonce nonce)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+ BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
+ memcpy(key_words, key, sizeof(key_words));
+ le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
- int ret = crypto_skcipher_encrypt(req);
- if (ret)
- pr_err("got error %i from crypto_skcipher_encrypt()", ret);
-
- return ret;
-}
-
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- void *buf, size_t len)
-{
- if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg = {};
-
- sg_mark_end(&sg);
- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
- return do_encrypt_sg(tfm, nonce, &sg, len);
- } else {
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
- int ret;
-
- darray_init(&sgl);
-
- while (len) {
- unsigned offset = offset_in_page(buf);
- struct scatterlist sg = {
- .page_link = (unsigned long) vmalloc_to_page(buf),
- .offset = offset,
- .length = min(len, PAGE_SIZE - offset),
- };
+ BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
+ chacha_init(state, key_words, (const u8 *)nonce.d);
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
- BUG_ON(darray_push(&sgl, sg));
- }
-
- buf += sg.length;
- len -= sg.length;
- sgl_len += sg.length;
- }
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
- return ret;
- }
+ memzero_explicit(key_words, sizeof(key_words));
}
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
+static void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
+ void *data, size_t len)
{
- struct crypto_sync_skcipher *chacha20 =
- crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret;
-
- ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
- return ret;
- }
-
- ret = crypto_skcipher_setkey(&chacha20->base,
- (void *) key, sizeof(*key));
- if (ret) {
- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
- goto err;
- }
+ u32 state[CHACHA_STATE_WORDS];
- ret = do_encrypt(chacha20, nonce, buf, len);
-err:
- crypto_free_sync_skcipher(chacha20);
- return ret;
+ bch2_chacha20_init(state, key, nonce);
+ chacha20_crypt(state, data, data, len);
+ memzero_explicit(state, sizeof(state));
}
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
+ struct bch_fs *c, struct nonce nonce)
{
- u8 key[POLY1305_KEY_SIZE];
- int ret;
+ u8 key[POLY1305_KEY_SIZE] = { 0 };
nonce.d[3] ^= BCH_NONCE_POLY;
- memset(key, 0, sizeof(key));
- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
- if (ret)
- return ret;
-
- desc->tfm = c->poly1305;
- crypto_shash_init(desc);
- crypto_shash_update(desc, key, sizeof(key));
- return 0;
+ bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
+ poly1305_init(desc, key);
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
-
- crypto_shash_update(desc, data, len);
- crypto_shash_final(desc, digest);
+ bch2_poly1305_init(&dctx, c, nonce);
+ poly1305_update(&dctx, data, len);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (!bch2_csum_type_is_encryption(type))
return 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
return -BCH_ERR_no_encryption_key;
- return do_encrypt(c->chacha20, nonce, data, len);
+ bch2_chacha20(&c->chacha20_key, nonce, data, len);
+ return 0;
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
+ bch2_poly1305_init(&dctx, c, nonce);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
- crypto_shash_update(desc, p, bv.bv_len);
+ poly1305_update(&dctx, p, bv.bv_len);
kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
- crypto_shash_update(desc,
+ poly1305_update(&dctx,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
- crypto_shash_final(desc, digest);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
+ u32 chacha_state[CHACHA_STATE_WORDS];
int ret = 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
return -BCH_ERR_no_encryption_key;
- darray_init(&sgl);
+ bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);
bio_for_each_segment(bv, bio, iter) {
- struct scatterlist sg = {
- .page_link = (unsigned long) bv.bv_page,
- .offset = bv.bv_offset,
- .length = bv.bv_len,
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
-
- BUG_ON(darray_push(&sgl, sg));
+ void *p;
+
+ /*
+ * chacha_crypt() assumes that the length is a multiple of
+ * CHACHA_BLOCK_SIZE on any non-final call.
+ */
+ if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
+ bch_err_ratelimited(c, "bio not aligned for encryption");
+ ret = -EIO;
+ break;
}
- sgl_len += sg.length;
+ p = bvec_kmap_local(&bv);
+ chacha20_crypt(chacha_state, p, p, bv.bv_len);
+ kunmap_local(p);
}
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
+ memzero_explicit(chacha_state, sizeof(chacha_state));
return ret;
}
@@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
}
/* decrypt real key: */
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
- if (ret)
- goto err;
+ bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
if (bch2_key_is_encrypted(&sb_key)) {
bch_err(c, "incorrect encryption key");
@@ -668,31 +574,6 @@ err:
return ret;
}
-static int bch2_alloc_ciphers(struct bch_fs *c)
-{
- if (c->chacha20)
- return 0;
-
- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
- return ret;
- }
-
- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(poly1305);
- if (ret) {
- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
- crypto_free_sync_skcipher(chacha20);
- return ret;
- }
-
- c->chacha20 = chacha20;
- c->poly1305 = poly1305;
- return 0;
-}
-
#if 0
/*
@@ -797,35 +678,21 @@ err:
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (c->poly1305)
- crypto_free_shash(c->poly1305);
- if (c->chacha20)
- crypto_free_sync_skcipher(c->chacha20);
+ memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
}
int bch2_fs_encryption_init(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = 0;
+ int ret;
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
- goto out;
+ return 0;
- ret = bch2_alloc_ciphers(c);
+ ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
if (ret)
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
-
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
- if (ret)
- goto out;
-out:
- memzero_explicit(&key, sizeof(key));
- return ret;
+ return ret;
+ c->chacha20_key_set = true;
+ return 0;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 4ac251c8fcd8..1310782d3ae9 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -69,7 +69,6 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
bch2_csum_to_text(out, type, expected);
}
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
int bch2_revoke_key(struct bch_sb *);
@@ -156,7 +155,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
if (type >= BCH_CSUM_NR)
return false;
- if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
return false;
return true;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 1f8e035d7119..d6dd12d74d4f 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -121,7 +121,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
} while (0);
__set_current_state(TASK_RUNNING);
- del_timer_sync(&wait.cpu_timer);
+ timer_delete_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 85fc90342492..28ed32449913 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -371,13 +371,14 @@ static int attempt_compress(struct bch_fs *c,
};
zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm,
+ if (zlib_deflateInit2(&strm,
compression.level
? clamp_t(unsigned, compression.level,
Z_BEST_SPEED, Z_BEST_COMPRESSION)
: Z_DEFAULT_COMPRESSION,
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
+ Z_DEFAULT_STRATEGY) != Z_OK)
+ return 0;
if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
return 0;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index fe400dfc5d76..b211c97238ab 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -216,7 +216,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -398,7 +398,7 @@ restart_drop_extra_replicas:
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
+ bch2_btree_iter_set_pos(trans, &iter, next_pos);
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
if (trace_io_move_finish_enabled())
@@ -426,7 +426,7 @@ nowork:
count_event(c, io_move_fail);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
goto next;
}
out:
@@ -497,7 +497,7 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
bkey_err(k);
}));
bch2_trans_iter_exit(trans, &iter);
@@ -607,7 +607,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
prt_newline(out);
printbuf_indent_add(out, 2);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_printf(out, "read_done:\t\%u\n", m->read_done);
+ prt_printf(out, "read_done:\t%u\n", m->read_done);
bch2_write_op_to_text(out, &m->op);
printbuf_indent_sub(out, 2);
}
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 788af88f6979..5a8bc7013512 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -57,7 +57,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
@@ -297,7 +297,7 @@ out:
if (bio)
bio_put(bio);
kvfree(n_ondisk);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
#ifdef CONFIG_DEBUG_FS
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d7f9f79318a2..8488a7578115 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -287,8 +287,8 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
EBUG_ON(!dirent->v.d_casefold);
EBUG_ON(!cf_name->len);
- dirent->v.d_cf_name_block.d_name_len = name->len;
- dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len;
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
@@ -417,8 +417,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct qstr src_name_lookup, dst_name_lookup;
- struct btree_iter src_iter = { NULL };
- struct btree_iter dst_iter = { NULL };
+ struct btree_iter src_iter = {};
+ struct btree_iter dst_iter = {};
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
@@ -586,16 +586,16 @@ out_set_src:
}
if (delete_src) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
if (delete_dst) {
- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
@@ -642,7 +642,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
int ret = lockrestart_do(trans,
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
@@ -771,7 +771,7 @@ int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash_info, &iter,
BTREE_UPDATE_internal_snapshot_node);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index a59f6c12529b..b007319b72e9 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -739,7 +739,7 @@ int bch2_accounting_read(struct bch_fs *c)
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
@@ -930,7 +930,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 5df8de0b8c02..1186280b29e9 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -555,9 +555,9 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
? rcu_dereference(c->devs[t.dev])
: NULL;
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ if (ca && percpu_ref_tryget(&ca->io_ref[READ])) {
prt_printf(out, "/dev/%s", ca->name);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
} else {
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 6faeda7ad03d..a396865e8b17 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -105,6 +105,7 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ int rw;
u64 submit_time;
struct bio bio;
};
@@ -462,7 +463,8 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
if (gc)
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
+ unsafe_memcpy(&gc->r.e, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas), "VLA");
}
if (old_s) {
@@ -703,6 +705,7 @@ static void ec_block_endio(struct bio *bio)
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
+ int rw = ec_bio->rw;
bch2_account_io_completion(ca, bio_data_dir(bio),
ec_bio->submit_time, !bio->bi_status);
@@ -724,7 +727,7 @@ static void ec_block_endio(struct bio *bio)
}
bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
closure_put(cl);
}
@@ -775,6 +778,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->rw = rw;
ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
@@ -784,14 +788,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[rw]);
submit_bio(&ec_bio->bio);
offset += b;
}
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
}
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -1264,7 +1268,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
ob->sectors_free,
GFP_KERNEL, 0);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
if (ret)
s->err = ret;
@@ -1836,7 +1840,7 @@ static int __get_existing_stripe(struct btree_trans *trans,
ret = 1;
}
out:
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1949,7 +1953,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
+ bch2_btree_iter_set_pos(trans, &iter, start_pos);
continue;
}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index c8696f01eb14..a615e4852ded 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -287,7 +287,7 @@
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
- x(EIO, extent_poisened) \
+ x(EIO, extent_poisoned) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d4dfd13a8076..baf5dfb32298 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -34,7 +34,7 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
journal_cur_seq(&c->journal));
return true;
case BCH_ON_ERROR_panic:
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf);
panic(bch2_fmt(c, "panic after error"));
return true;
default:
@@ -45,6 +45,8 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
bool bch2_inconsistent_error(struct bch_fs *c)
{
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
printbuf_indent_add_nextline(&buf, 2);
bool ret = __bch2_inconsistent_error(c, &buf);
@@ -59,6 +61,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
const char *fmt, va_list args)
{
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
bch2_log_msg_start(c, &buf);
@@ -68,7 +71,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
if (trans)
bch2_trans_updates_to_text(&buf, trans);
bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
printbuf_exit(&buf);
return ret;
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 6aac579a692a..6bb42985306e 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -112,7 +112,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
unsigned nr_iters = 0;
int ret;
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -126,9 +126,9 @@ int bch2_extent_atomic_end(struct btree_trans *trans,
if (ret < 0)
return ret;
- bch2_trans_copy_iter(&copy, iter);
+ bch2_trans_copy_iter(trans, &copy, iter);
- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) {
unsigned offset = 0;
if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index ae7c7a177e10..dca2b8425cc0 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -139,7 +139,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return -BCH_ERR_extent_poisened;
+ return -BCH_ERR_extent_poisoned;
rcu_read_lock();
const union bch_extent_entry *entry;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index a03e2c780cba..e3a75dcca60c 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -183,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -225,11 +225,26 @@ static void bchfs_read(struct btree_trans *trans,
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
- swap(rbio->bio.bi_iter.bi_size, bytes);
+ /*
+ * Careful there's a landmine here if bch2_read_extent() ever
+ * starts returning transaction restarts here.
+ *
+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read
+ * from this extent" with the swap call, and we restore it
+ * below. That restore needs to come before checking for
+ * errors.
+ *
+ * But unlike __bch2_read(), we use the rbio bvec iter, not one
+ * on the stack, so we can't do the restore right after the
+ * bch2_read_extent() call: we don't own that iterator anymore
+ * if BCH_READ_last_fragment is set, since we may have submitted
+ * that rbio instead of cloning it.
+ */
if (flags & BCH_READ_last_fragment)
break;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
err:
if (ret &&
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index c80ed3a54e70..65c2c33d253d 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -48,7 +48,7 @@ static void nocow_flush_endio(struct bio *_bio)
struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
closure_put(bio->cl);
- percpu_ref_put(&bio->ca->io_ref);
+ percpu_ref_put(&bio->ca->io_ref[WRITE]);
bio_put(&bio->bio);
}
@@ -71,7 +71,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
rcu_read_lock();
ca = rcu_dereference(c->devs[dev]);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
+ if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE]))
ca = NULL;
rcu_read_unlock();
@@ -636,9 +636,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@@ -649,13 +649,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
/* already reserved */
if (bkey_extent_is_reservation(k) &&
bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
if (bkey_extent_is_data(k.k) &&
!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
@@ -676,7 +676,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
}
- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+ bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
if (ret)
goto bkey_err;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index c1553e44e049..14886e1d4d6d 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -69,7 +69,7 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
if (ret < 0)
return ret;
- ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding);
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
if (ret)
return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index fc834bdf1f52..5a41b1a8e54f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -88,7 +88,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
void *p, unsigned fields)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
retry:
@@ -1075,7 +1075,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans *trans;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
@@ -1330,9 +1330,9 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_max(&iter, end);
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
continue;
@@ -1342,7 +1342,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
@@ -1380,7 +1380,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bkey_copy(prev.k, cur.k);
have_extent = true;
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
}
bch2_trans_iter_exit(trans, &iter);
@@ -1697,17 +1697,17 @@ retry:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter1, snapshot);
- bch2_btree_iter_set_snapshot(&iter2, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
if (ret)
goto err;
if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+ bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
- k = bch2_btree_iter_peek_slot(&iter1);
+ k = bch2_btree_iter_peek_slot(trans, &iter1);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1731,7 +1731,7 @@ retry:
* File with multiple hardlinks and our backref is to the wrong
* directory - linear search:
*/
- for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
if (k.k->p.inode > dir->ei_inode.bi_inum)
break;
@@ -2237,7 +2237,7 @@ got_sb:
/* XXX: create an anonymous device for multi device filesystems */
sb->s_bdev = bdev;
sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
break;
}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 52320295dcf6..7b25cedd3e40 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -186,7 +186,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = { NULL };
+ struct btree_iter lostfound_iter = {};
u64 inum = 0;
unsigned d_type = 0;
int ret;
@@ -295,8 +295,8 @@ create_lostfound:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(&lostfound_iter);
+ bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
if (ret)
goto err;
@@ -321,6 +321,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
return false;
+ /*
+ * Subvolume roots are special: older versions of subvolume roots may be
+ * disconnected, it's only the newest version that matters.
+ *
+ * We only keep a single dirent pointing to a subvolume root, i.e.
+ * older versions of snapshots will not have a different dirent pointing
+ * to the same subvolume root.
+ *
+ * This is because dirents that point to subvolumes are only visible in
+ * the parent subvolume - versioning is not needed - and keeping them
+ * around would break fsck, because when we're crossing subvolumes we
+ * don't have a consistent snapshot ID to do check the inode <-> dirent
+ * relationships.
+ *
+ * Thus, a subvolume root that's been renamed after a snapshot will have
+ * a disconnected older version - that's expected.
+ *
+ * Note that taking a snapshot always updates the root inode (to update
+ * the dirent backpointer), so a subvolume root inode with
+ * BCH_INODE_has_child_snapshot is never visible.
+ */
+ if (inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot))
+ return false;
+
return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
}
@@ -544,7 +569,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
new_inode.bi_subvol = subvolid;
int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret)
@@ -609,7 +634,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct btree_iter iter = {};
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
@@ -1007,6 +1032,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
+ inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
+ /* Older version of a renamed subvolume root: we won't have a
+ * correct dirent for it. That's expected, see
+ * inode_should_reattach().
+ *
+ * We don't clear the backpointer field when doing the rename
+ * because there might be arbitrarily many versions in older
+ * snapshots.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ *write_inode = true;
+ goto out;
+ }
+
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
@@ -1027,7 +1069,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
inode->bi_dir_offset = 0;
*write_inode = true;
}
-
+out:
ret = 0;
fsck_err:
bch2_trans_iter_exit(trans, &dirent_iter);
@@ -1557,7 +1599,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = { NULL };
+ struct btree_iter iter1, iter2 = {};
struct bkey_s_c k1, k2;
int ret;
@@ -1566,7 +1608,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter1, btree, pos1,
BTREE_ITER_all_snapshots|
BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
+ k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
goto err;
@@ -1586,12 +1628,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
goto err;
}
- bch2_trans_copy_iter(&iter2, &iter1);
+ bch2_trans_copy_iter(trans, &iter2, &iter1);
while (1) {
- bch2_btree_iter_advance(&iter2);
+ bch2_btree_iter_advance(trans, &iter2);
- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
+ k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
ret = bkey_err(k2);
if (ret)
goto err;
@@ -1791,9 +1833,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
- ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_btree_delete_at(trans, &iter2,
BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter2);
@@ -2185,7 +2227,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BTREE_ID_dirents,
SPOS(k.k->p.inode, k.k->p.offset, *i),
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&delete_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
hash_info,
&delete_iter,
@@ -2412,7 +2454,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
bch2_trans_iter_exit(trans, &parent_iter);
bch2_trans_iter_init(trans, &parent_iter,
BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(&parent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &parent_iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 80051073f613..b51d98cf8a80 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -940,7 +940,7 @@ int bch2_inode_create(struct btree_trans *trans,
BTREE_ITER_intent);
struct bkey_s_c k;
again:
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((k = bch2_btree_iter_peek(trans, iter)).k &&
!(ret = bkey_err(k)) &&
bkey_lt(k.k->p, POS(0, max))) {
if (pos < iter->pos.offset)
@@ -951,7 +951,7 @@ again:
* we've found just one:
*/
pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
}
if (!ret && pos < max)
@@ -967,12 +967,12 @@ again:
/* Retry from start */
pos = start = min;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -1009,9 +1009,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_max(&iter, end);
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1042,7 +1042,7 @@ err:
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
u32 snapshot;
int ret;
@@ -1207,7 +1207,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 6b842c8d21be..cc07729a4b62 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
bch2_bkey_buf_init(&new);
closure_init_stack(&cl);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
return ret;
@@ -164,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
/*
* peek_max() doesn't have ideal semantics for extents:
*/
- k = bch2_btree_iter_peek_max(iter, end_pos);
+ k = bch2_btree_iter_peek_max(trans, iter, end_pos);
if (!k.k)
break;
@@ -230,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans,
u64 new_i_size,
bool warn)
{
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
@@ -399,7 +399,7 @@ case LOGGED_OP_FINSERT_start:
if (ret)
goto err;
} else {
- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+ bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -425,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents:
if (ret)
goto btree_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
k = insert
- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
+ ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index fd01e67b3e84..def4a26a3b45 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -394,7 +394,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
if (rbio->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
if (rbio->split) {
@@ -487,6 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
+ int orig_error = rbio->ret;
+
struct btree_trans *trans = bch2_trans_get(c);
trace_io_read_retry(&rbio->bio);
@@ -519,7 +521,9 @@ static void bch2_rbio_retry(struct work_struct *work)
if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
- } else {
+ } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
+ orig_error != -BCH_ERR_data_read_ptr_stale_race &&
+ !failed.nr) {
struct printbuf buf = PRINTBUF;
lockrestart_do(trans,
@@ -909,7 +913,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
prt_printf(&buf, "memory gen: %u", gen);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
if (!ret) {
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
@@ -977,7 +981,8 @@ retry_pick:
goto err;
}
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
+ !c->chacha20_key_set) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
@@ -1003,7 +1008,7 @@ retry_pick:
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
goto retry_pick;
}
@@ -1036,7 +1041,7 @@ retry_pick:
*/
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
rbio->ret = -BCH_ERR_data_read_buffer_too_small;
goto out_read_done;
}
@@ -1285,12 +1290,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, bvec_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1344,14 +1349,16 @@ err:
bch2_trans_iter_exit(trans, &iter);
- if (ret) {
- struct printbuf buf = PRINTBUF;
- lockrestart_do(trans,
- bch2_inum_offset_err_msg_trans(trans, &buf, inum,
- bvec_iter.bi_sector << 9));
- prt_printf(&buf, "read error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ if (unlikely(ret)) {
+ if (ret != -BCH_ERR_extent_poisoned) {
+ struct printbuf buf = PRINTBUF;
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
+ bvec_iter.bi_sector << 9));
+ prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
rbio->bio.bi_status = BLK_STS_IOERR;
rbio->ret = ret;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 07b55839768e..a418fa62f09d 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -168,9 +168,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- bch2_trans_copy_iter(&iter, extent_iter);
+ bch2_trans_copy_iter(trans, &iter, extent_iter);
- for_each_btree_key_max_continue_norestart(iter,
+ for_each_btree_key_max_continue_norestart(trans, iter,
new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
@@ -292,7 +292,7 @@ int bch2_extent_update(struct btree_trans *trans,
* path already traversed at iter->pos because
* bch2_trans_extent_update() will use it to attempt extent merging
*/
- ret = __bch2_btree_iter_traverse(iter);
+ ret = __bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -337,7 +337,7 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
+ bch2_btree_iter_set_pos(trans, iter, next_pos);
return 0;
}
@@ -445,6 +445,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
+ /*
+ * XXX: btree writes should be using io_ref[WRITE], but we
+ * aren't retrying failed btree writes yet (due to device
+ * removal/ro):
+ */
struct bch_dev *ca = nocow
? bch2_dev_have_ref(c, ptr->dev)
: bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
@@ -697,12 +702,19 @@ static void bch2_write_endio(struct bio *bio)
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
wbio->submit_time, !bio->bi_status);
- if (bio->bi_status) {
- bch_err_inum_offset_ratelimited(ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status));
+ if (unlikely(bio->bi_status)) {
+ if (ca)
+ bch_err_inum_offset_ratelimited(ca,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ else
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
op->flags |= BCH_WRITE_io_error;
}
@@ -715,7 +727,7 @@ static void bch2_write_endio(struct bio *bio)
}
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -1293,7 +1305,7 @@ retry:
if (ret)
break;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
break;
@@ -1377,7 +1389,7 @@ retry:
bch2_keylist_push(&op->insert_keys);
if (op->flags & BCH_WRITE_submitted)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
out:
bch2_trans_iter_exit(trans, &iter);
@@ -1414,7 +1426,7 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
+ percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]);
/* Fall back to COW path: */
goto out;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8a36d5536668..d8f74b6d0a75 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1315,7 +1315,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return ret;
}
}
@@ -1404,6 +1404,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
nr = cur_seq - last_seq;
+ /*
+ * Extra fudge factor, in case we crashed when the journal pin fifo was
+ * nearly or completely full. We'll need to be able to open additional
+ * journal entries (at least a few) in order for journal replay to get
+ * going:
+ */
+ nr += nr / 4;
+
if (nr + 1 > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
@@ -1461,11 +1469,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->reservations.idx = journal_cur_seq(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
-
- bch2_journal_space_available(j);
spin_unlock(&j->lock);
- return bch2_journal_reclaim_start(j);
+ return 0;
}
/* init/exit: */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 2debc213e47c..2a54ac79189b 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1218,7 +1218,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvfree(buf.data);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
closure_return(cl);
return;
err:
@@ -1253,7 +1253,7 @@ int bch2_journal_read(struct bch_fs *c,
if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro) &&
- percpu_ref_tryget(&ca->io_ref))
+ percpu_ref_tryget(&ca->io_ref[READ]))
closure_call(&ca->journal.read,
bch2_journal_read_device,
system_unbound_wq,
@@ -1460,7 +1460,7 @@ fsck_err:
static void journal_advance_devs_to_next_bucket(struct journal *j,
struct dev_alloc_list *devs,
- unsigned sectors, u64 seq)
+ unsigned sectors, __le64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -1768,7 +1768,7 @@ static void journal_write_endio(struct bio *bio)
}
closure_put(&w->io);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[WRITE]);
}
static CLOSURE_CALLBACK(journal_write_submit)
@@ -1843,7 +1843,7 @@ static CLOSURE_CALLBACK(journal_write_preflush)
if (w->separate_flush) {
for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[WRITE]);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 57ad662871ba..90dcf80bd64a 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -130,7 +130,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
@@ -154,7 +154,7 @@ retry:
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 5d41260e10da..fc396b9fa754 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -545,7 +545,7 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *
BTREE_ID_reflink, reflink_pos,
BTREE_ITER_not_extents);
- struct bkey_s_c k = bch2_btree_iter_peek(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
if (!k.k || bkey_err(k)) {
bch2_trans_iter_exit(trans, iter);
return k;
@@ -603,7 +603,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -681,7 +681,7 @@ next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
bch2_trans_iter_exit(trans, &reflink_iter);
@@ -794,7 +794,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&bp_iter);
+ k = bch2_btree_iter_peek(trans, &bp_iter);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -876,7 +876,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (ctxt->stats)
atomic64_add(sectors, &ctxt->stats->sectors_seen);
next:
- bch2_btree_iter_advance(&bp_iter);
+ bch2_btree_iter_advance(trans, &bp_iter);
}
err:
bch2_trans_iter_exit(trans, &bp_iter);
@@ -991,7 +991,7 @@ static int bch2_move_btree(struct bch_fs *c,
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
break;
@@ -1011,7 +1011,7 @@ retry:
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 5126c870ce5b..159410c50861 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -280,7 +280,11 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
s64 wait = S64_MAX, fragmented_allowed, fragmented;
for_each_rw_member(c, ca) {
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
+ struct bch_dev_usage usage;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage.buckets[i] = usage_full.d[i].buckets;
fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
ca->mi.bucket_size) >> 1);
@@ -288,7 +292,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
- fragmented += usage.d[i].fragmented;
+ fragmented += usage_full.d[i].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented));
}
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index ee7251709fb9..0d65ea96f7a2 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -28,8 +28,8 @@ int bch2_create_trans(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
@@ -127,8 +127,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(&dir_iter);
+ bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dir_iter);
if (ret)
goto err;
}
@@ -177,9 +177,9 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_depth = dir_u->bi_depth + 1;
inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
- ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, new_inode);
err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -193,8 +193,8 @@ int bch2_link_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(c);
u64 dir_offset = 0;
@@ -253,9 +253,9 @@ int bch2_unlink_trans(struct btree_trans *trans,
bool deleting_subvol)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter dirent_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter dirent_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
subvol_inum inum;
u64 now = bch2_current_time(c);
@@ -301,7 +301,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(&dirent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -310,8 +310,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
* If we're deleting a subvolume, we need to really delete the
* dirent, not just emit a whiteout in the current snapshot:
*/
- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dirent_iter);
+ bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dirent_iter);
if (ret)
goto err;
} else {
@@ -390,10 +390,10 @@ int bch2_rename_trans(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = { NULL };
- struct btree_iter dst_dir_iter = { NULL };
- struct btree_iter src_inode_iter = { NULL };
- struct btree_iter dst_inode_iter = { NULL };
+ struct btree_iter src_dir_iter = {};
+ struct btree_iter dst_dir_iter = {};
+ struct btree_iter src_inode_iter = {};
+ struct btree_iter dst_inode_iter = {};
struct bch_hash_info src_hash, dst_hash;
subvol_inum src_inum, dst_inum;
u64 src_offset, dst_offset;
@@ -666,7 +666,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
+ struct btree_iter bp_iter = {};
int ret = 0;
if (inode_points_to_dirent(target, d))
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 8b857fc33244..3d4755d73af7 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
KEY_TYPE_QUOTA_NOCHECK);
advance:
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
return 0;
}
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index b9bde04b66c0..c63fa53f30d2 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -233,7 +233,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -281,7 +281,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -301,7 +301,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
struct btree_iter *work_iter)
{
return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
+ ? bch2_btree_iter_peek(trans, work_iter)
: bkey_s_c_null;
}
@@ -335,7 +335,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
if (bkey_err(k))
return k;
@@ -511,7 +511,7 @@ static int do_rebalance(struct moving_context *ctxt)
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct btree_iter rebalance_work_iter, extent_iter = {};
struct bkey_s_c k;
int ret = 0;
@@ -552,7 +552,7 @@ static int do_rebalance(struct moving_context *ctxt)
if (ret)
break;
- bch2_btree_iter_advance(&rebalance_work_iter);
+ bch2_btree_iter_advance(trans, &rebalance_work_iter);
}
bch2_trans_iter_exit(trans, &extent_iter);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 266c5770c824..606d684e6f23 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -198,7 +198,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter);
+ int ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -261,7 +261,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter);
+ ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -270,7 +270,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
@@ -389,9 +389,9 @@ int bch2_journal_replay(struct bch_fs *c)
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
- sort(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
+ sort_nonatomic(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
@@ -1125,7 +1125,10 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1);
+ ret = bch2_fs_journal_start(&c->journal, 1);
+ if (ret)
+ goto err;
+
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index ee23f1f93acc..710178e3da4c 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -495,7 +495,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bool reflink_p_may_update_opts_field)
{
struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = { NULL };
+ struct btree_iter reflink_iter = {};
struct bkey_s_c k;
struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
@@ -507,7 +507,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(&reflink_iter);
+ k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -569,12 +569,13 @@ err:
return ret;
}
-static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+static struct bkey_s_c get_next_src(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos end)
{
struct bkey_s_c k;
int ret;
- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
if (bkey_extent_is_unwritten(k))
continue;
@@ -583,7 +584,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
}
if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
}
@@ -647,27 +648,27 @@ s64 bch2_remap_range(struct bch_fs *c,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
&dst_snapshot);
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
if (dst_inum.inum < src_inum.inum) {
/* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(&dst_iter);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter);
if (ret)
continue;
}
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(&src_iter, src_want);
+ bch2_btree_iter_set_pos(trans, &src_iter, src_want);
- src_k = get_next_src(&src_iter, src_end);
+ src_k = get_next_src(trans, &src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
@@ -738,7 +739,7 @@ s64 bch2_remap_range(struct bch_fs *c,
do {
struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
bch2_trans_begin(trans);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 5d43e3504386..dc53d25c7cbb 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -290,8 +290,8 @@ enum bch_fsck_flags {
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
- x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, 0) \
+ x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \
+ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
x(sb_clean_entry_overrun, 267, 0) \
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 38261638a611..06bb41a3f360 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -20,7 +20,7 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return !percpu_ref_is_zero(&ca->io_ref);
+ return !percpu_ref_is_zero(&ca->io_ref[READ]);
}
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
@@ -156,33 +156,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
- unsigned state_mask)
+ unsigned state_mask,
+ int rw)
{
rcu_read_lock();
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
+ !percpu_ref_tryget(&ca->io_ref[rw])))
;
rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(_c, _ca, state_mask) \
+#define __for_each_online_member(_c, _ca, state_mask, rw) \
for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));)
#define for_each_online_member(c, ca) \
- __for_each_online_member(c, ca, ~0)
+ __for_each_online_member(c, ca, ~0, READ)
#define for_each_rw_member(c, ca) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE)
#define for_each_readable_member(c, ca) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ)
static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
@@ -287,7 +288,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
+ if (ca && !percpu_ref_tryget(&ca->io_ref[rw]))
ca = NULL;
rcu_read_unlock();
@@ -297,7 +298,7 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
return ca;
if (ca)
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[rw]);
return NULL;
}
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0c65065b08ec..b7de29aed839 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -843,9 +843,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- if (bch2_snapshot_exists(c, id))
- return 0;
-
/* Do we need to reconstruct the snapshot_tree entry as well? */
struct btree_iter iter;
struct bkey_s_c k;
@@ -1074,9 +1071,9 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
- struct btree_iter c_iter = (struct btree_iter) { NULL };
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
+ struct btree_iter iter, p_iter = {};
+ struct btree_iter c_iter = {};
+ struct btree_iter tree_iter = {};
struct bkey_s_c_snapshot s;
u32 parent_id, child_id;
unsigned i;
@@ -1193,13 +1190,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(&iter);
+ k = bch2_btree_iter_prev_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index 602afca2f5ef..a90bf7b8a2b4 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -195,7 +195,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
int ret = 0;
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 575ad1e03904..09a354a26c3b 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -231,11 +231,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_trans_copy_iter(&iter, start);
+ bch2_trans_copy_iter(trans, &iter, start);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -280,7 +280,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, iter);
+ bch2_trans_copy_iter(trans, &slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index cd0d8e5e44e7..5537283d0bea 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -275,7 +275,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
bch2_trans_iter_exit(trans, &iter);
return bkey_err(k) ?: k.k && k.k->p.inode == subvol
@@ -574,7 +574,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
bool ro)
{
struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct btree_iter dst_iter, src_iter = {};
struct bkey_i_subvolume *new_subvol = NULL;
struct bkey_i_subvolume *src_subvol = NULL;
u32 parent = 0, new_nodes[2], snapshot_subvols[2];
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 910f6196700e..f640c1e3d639 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
- u32 subvolid, unsigned flags)
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, u32 subvolid, unsigned flags)
{
u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
if (ret)
return bkey_s_c_err(ret);
- bch2_btree_iter_set_snapshot(iter, snapshot);
- return bch2_btree_iter_peek_max_type(iter, end, flags);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
+ return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
}
#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
@@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
_end, _subvolid, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 572b06bfa0b8..25b6bce05c3c 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -73,14 +73,30 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
? 0
: -BCH_ERR_may_not_use_incompat_feature;
+ mutex_lock(&c->sb_lock);
if (!ret) {
- mutex_lock(&c->sb_lock);
SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ } else {
+ darray_for_each(c->incompat_versions_requested, i)
+ if (version == *i)
+ goto out;
+
+ darray_push(&c->incompat_versions_requested, version);
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "requested incompat feature ");
+ bch2_version_to_text(&buf, version);
+ prt_str(&buf, " currently not enabled");
+ prt_printf(&buf, "\n set version_upgrade=incompat to enable");
+
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
+out:
+ mutex_unlock(&c->sb_lock);
+
return ret;
}
@@ -248,7 +264,7 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
return NULL;
}
}
@@ -945,7 +961,7 @@ static void write_super_endio(struct bio *bio)
}
closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
}
static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -963,7 +979,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
closure_bio_submit(bio, &c->sb_write);
}
@@ -989,7 +1005,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
closure_bio_submit(bio, &c->sb_write);
}
@@ -1014,13 +1030,20 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
+ /*
+ * Note: we do writes to RO devices here, and we might want to change
+ * that in the future.
+ *
+ * For now, we expect to be able to call write_super() when we're not
+ * yet RW:
+ */
for_each_online_member(c, ca) {
ret = darray_push(&online_devices, ca);
if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- percpu_ref_put(&ca->io_ref);
+ percpu_ref_put(&ca->io_ref[READ]);
goto out;
}
- percpu_ref_get(&ca->io_ref);
+ percpu_ref_get(&ca->io_ref[READ]);
}
/* Make sure we're using the new magic numbers: */
@@ -1186,7 +1209,7 @@ out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
darray_for_each(online_devices, ca)
- percpu_ref_put(&(*ca)->io_ref);
+ percpu_ref_put(&(*ca)->io_ref[READ]);
darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 20208f3c5d8b..e8a17ed1615d 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -70,14 +70,10 @@
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
-#include <crypto/hash.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: chacha20");
-MODULE_SOFTDEP("pre: poly1305");
-MODULE_SOFTDEP("pre: xxhash");
const char * const bch2_fs_flag_strs[] = {
#define x(n) #n,
@@ -185,6 +181,7 @@ static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
@@ -294,8 +291,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
/*
* After stopping journal:
*/
- for_each_member_device(c, ca)
+ for_each_member_device(c, ca) {
+ bch2_dev_io_ref_stop(ca, WRITE);
bch2_dev_allocator_remove(c, ca);
+ }
}
#ifndef BCH_WRITE_REF_DEBUG
@@ -465,10 +464,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- ret = bch2_fs_mark_dirty(c);
- if (ret)
- goto err;
-
clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
@@ -480,10 +475,24 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
- for_each_rw_member(c, ca)
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
bch2_dev_allocator_add(c, ca);
+ percpu_ref_reinit(&ca->io_ref[WRITE]);
+ }
bch2_recalc_capacity(c);
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ spin_lock(&c->journal.lock);
+ bch2_journal_space_available(&c->journal);
+ spin_unlock(&c->journal.lock);
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
+
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
@@ -495,11 +504,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
atomic_long_inc(&c->writes[i]);
}
#endif
-
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
- goto err;
-
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
@@ -585,6 +589,7 @@ static void __bch2_fs_free(struct bch_fs *c)
free_percpu(c->online_reserved);
}
+ darray_exit(&c->incompat_versions_requested);
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
free_percpu(c->usage);
@@ -675,6 +680,7 @@ void bch2_fs_free(struct bch_fs *c)
if (ca) {
EBUG_ON(atomic_long_read(&ca->ref) != 1);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -993,12 +999,6 @@ static void print_mount_opts(struct bch_fs *c)
prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
- if (c->opts.read_only) {
- prt_str(&p, " opts=");
- first = false;
- prt_printf(&p, "ro");
- }
-
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
@@ -1014,6 +1014,11 @@ static void print_mount_opts(struct bch_fs *c)
bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
+ if (c->sb.version_incompat_allowed != c->sb.version) {
+ prt_printf(&p, "\n allowing incompatible features above ");
+ bch2_version_to_text(&p, c->sb.version_incompat_allowed);
+ }
+
bch_info(c, "%s", p.buf);
printbuf_exit(&p);
}
@@ -1199,6 +1204,15 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
/* Device startup/shutdown: */
+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
+{
+ if (!percpu_ref_is_zero(&ca->io_ref[rw])) {
+ reinit_completion(&ca->io_ref_completion[rw]);
+ percpu_ref_kill(&ca->io_ref[rw]);
+ wait_for_completion(&ca->io_ref_completion[rw]);
+ }
+}
+
static void bch2_dev_release(struct kobject *kobj)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
@@ -1208,6 +1222,9 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
+ WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
+ WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+
cancel_work_sync(&ca->io_error_work);
bch2_dev_unlink(ca);
@@ -1226,7 +1243,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
- percpu_ref_exit(&ca->io_ref);
+ percpu_ref_exit(&ca->io_ref[WRITE]);
+ percpu_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
#endif
@@ -1238,14 +1256,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
- if (percpu_ref_is_zero(&ca->io_ref))
+ if (percpu_ref_is_zero(&ca->io_ref[READ]))
return;
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->io_ref_completion);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->io_ref_completion);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_dev_unlink(ca);
@@ -1262,11 +1278,18 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
}
#endif
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
+static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref)
+{
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]);
+
+ complete(&ca->io_ref_completion[READ]);
+}
+
+static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref)
{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]);
- complete(&ca->io_ref_completion);
+ complete(&ca->io_ref_completion[WRITE]);
}
static void bch2_dev_unlink(struct bch_dev *ca)
@@ -1330,7 +1353,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
- init_completion(&ca->io_ref_completion);
+ init_completion(&ca->io_ref_completion[READ]);
+ init_completion(&ca->io_ref_completion[WRITE]);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1356,7 +1380,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
bch2_dev_allocator_background_init(ca);
- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
+ if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
@@ -1419,7 +1445,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
return -BCH_ERR_device_size_too_small;
}
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
@@ -1438,7 +1465,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->dev = ca->disk_sb.bdev->bd_dev;
- percpu_ref_reinit(&ca->io_ref);
+ percpu_ref_reinit(&ca->io_ref[READ]);
return 0;
}
@@ -1568,6 +1595,8 @@ static bool bch2_fs_may_start(struct bch_fs *c)
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
+ bch2_dev_io_ref_stop(ca, WRITE);
+
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
@@ -1584,6 +1613,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+
+ if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
+ percpu_ref_reinit(&ca->io_ref[WRITE]);
+
bch2_dev_do_discards(ca);
}
@@ -1730,8 +1763,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
up_write(&c->state_lock);
return 0;
err:
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- !percpu_ref_is_zero(&ca->io_ref))
+ if (test_bit(BCH_FS_rw, &c->flags) &&
+ ca->mi.state == BCH_MEMBER_STATE_rw &&
+ !percpu_ref_is_zero(&ca->io_ref[READ]))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return ret;
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 6c6469814637..c265b102267a 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting once");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (first)");
if (ret)
@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting twice");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (second)");
if (ret)
@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
bch2_journal_flush_all_pins(&c->journal);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error");
if (ret)
@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -602,9 +602,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
ret = bkey_err(k);
if (ret)
break;
@@ -623,9 +623,9 @@ static int rand_mixed_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(trans, iter);
ret = bkey_err(k);
bch_err_msg(trans->c, ret, "lookup error");
if (ret)
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index f9667b944c0d..651da52b2cbc 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int type, int flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..73a2dfb854c5 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3dd555db3d32..aa58e0663a5d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3853,7 +3853,6 @@ static int write_dev_supers(struct btrfs_device *device,
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3926,7 +3925,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a13d81bb56a0..63aeacc54945 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4902,6 +4902,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 40709e2a44fc..7121d8c7a318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1139,8 +1139,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index cd5f38d6fbaa..3541efa765c7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -225,7 +225,7 @@ void zstd_cleanup_workspace_manager(void)
}
spin_unlock_bh(&wsm.lock);
- del_timer_sync(&wsm.timer);
+ timer_delete_sync(&wsm.timer);
}
/*
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7249d70e1a43..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,7 +3,7 @@ config CEPH_FS
tristate "Ceph distributed file system"
depends on INET
select CEPH_LIB
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 42e4d6eeb29f..9c20d78e41f6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -89,12 +89,12 @@ enum {
};
static const struct fs_parameter_spec devpts_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_s32 ("max", Opt_max),
fsparam_u32oct ("mode", Opt_mode),
fsparam_flag ("newinstance", Opt_newinstance),
fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 331e49cd1b8d..8f68ec49ad89 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,8 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CRC32
select FS_IOMAP
- select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9581e9bf8192..767fb4acdc93 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -56,7 +56,7 @@ struct erofs_super_block {
union {
__le16 rootnid_2b; /* nid of root directory */
__le16 blocks_hi; /* (48BIT on) blocks count MSB */
- } rb;
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
__le64 epoch; /* base seconds used for compact inodes */
__le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
@@ -148,7 +148,7 @@ union erofs_inode_i_nb {
__le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
__le16 blocks_hi; /* total blocks count MSB */
__le16 startblk_hi; /* starting block number MSB */
-};
+} __packed;
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
@@ -369,9 +369,9 @@ struct z_erofs_map_header {
* bit 7 : pack the whole file into packed inode
*/
__u8 h_clusterbits;
- };
+ } __packed;
__le16 h_extents_hi; /* extent count MSB */
- };
+ } __packed;
};
enum {
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index bec4b56b3826..4fa0a0121288 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -32,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
ret = 0;
}
if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
rq->bio.bi_end_io(&rq->bio);
} else {
bio_for_each_folio_all(fi, &rq->bio) {
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 0671184d9cf1..5c061aaeeb45 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -725,7 +725,6 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->pclustersize = map->m_plen;
- pcl->pageofs_in = pageofs_in;
pcl->length = 0;
pcl->partial = true;
pcl->next = fe->head;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 8de50df05dfe..14ea47f954f5 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -559,7 +559,8 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
pos += sizeof(__le64);
lstart = 0;
} else {
- lstart = map->m_la >> vi->z_lclusterbits;
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
pa = EROFS_NULL_ADDR;
}
@@ -614,7 +615,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode,
if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT;
vi->z_fragmentoff = map->m_plen;
- if (recsz >= offsetof(struct z_erofs_extent, pstart_lo))
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
vi->z_fragmentoff |= map->m_pa << 32;
} else if (map->m_plen) {
map->m_flags |= EROFS_MAP_MAPPED |
diff --git a/fs/exec.c b/fs/exec.c
index 5d1c0d2dc403..8e4ea5f1e64c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1864,9 +1864,9 @@ static int bprm_execve(struct linux_binprm *bprm)
goto out;
sched_mm_cid_after_execve(current);
+ rseq_execve(current);
/* execve succeeded */
current->in_execve = 0;
- rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
@@ -1883,6 +1883,7 @@ out:
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
+ rseq_set_notify_resume(current);
current->in_execve = 0;
return retval;
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1dc09ed5d403..94c7d2d828a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -386,10 +386,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -4724,22 +4725,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, err_str);
+ return -EFSCORRUPTED;
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4751,7 +4773,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4780,10 +4801,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -5065,13 +5086,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
}
-
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0d523e9fb3d5..f88424c28194 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
unsigned char blocksize_bits = min_t(unsigned char,
sb->s_blocksize_bits,
EXT4_MAX_BLOCK_LOG_SIZE);
- struct sg {
- struct ext4_group_info info;
- ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
- } sg;
+ DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
+ EXT4_MAX_BLOCK_LOG_SIZE + 2);
group--;
if (group == 0)
@@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
" 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
- i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
@@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
* We care only about free space counters in the group info and
* these are safe to access even after the buddy has been unloaded
*/
- memcpy(&sg, grinfo, i);
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
+ memcpy(sg, grinfo, i);
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
+ sg->bb_fragments, sg->bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
+ sg->bb_counters[i] : 0);
seq_puts(seq, " ]");
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
seq_puts(seq, " Block bitmap corrupted!");
seq_putc(seq, '\n');
return 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cb5cb33b1d91..e9712e64ec8f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1971,7 +1971,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8122d4ffb3b5..181934499624 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5680,7 +5680,7 @@ failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
ext4_stop_mmpd(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_delete_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
#if IS_ENABLED(CONFIG_UNICODE)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 2c7b24cb67ad..53c2626e90e7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1669,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
+ if (!fsc->source)
+ return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
* to drop the reference to this object.
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index be7f87a8e11a..7bd231d16d4a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,7 +4,6 @@ config GFS2_FS
select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
- select LIBCRC32C
select QUOTACTL
select FS_IOMAP
help
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 6add6ebfef89..cb823a8a6ba9 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 1;
+ if (key_len > sizeof(hfs_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfs_btree_key));
+ pr_err("hfs: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 87974d5e6791..079ea80534f7 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 2;
+ if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfsplus_btree_key));
+ pr_err("hfsplus: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 31553372b33a..5b08bd417b28 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -259,7 +259,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
}
/* truncate len if we find any trailing uptodate block(s) */
- for ( ; i <= last; i++) {
+ while (++i <= last) {
if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size;
last = i - 1;
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 35768a63fb1d..421d247fae52 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
return NULL;
return isofs_export_iget(sb,
- fh_len > 2 ? ifid->parent_block : 0,
+ fh_len > 3 ? ifid->parent_block : 0,
ifid->parent_offset,
fh_len > 4 ? ifid->parent_generation : 0);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a5ccba25ff47..743a1d7633cd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -197,7 +197,7 @@ loop:
if (journal->j_commit_sequence != journal->j_commit_request) {
jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
write_lock(&journal->j_state_lock);
goto loop;
@@ -246,7 +246,7 @@ loop:
goto loop;
end_loop:
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd2_debug(1, "Journal thread exiting.\n");
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4061e0ba7010..bb815a002984 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
size_t retlen;
/* Nothing to do if not write-buffering the flash. In particular, we shouldn't
- del_timer() the timer we never initialised. */
+ call timer_delete() on the timer we never initialised. */
if (!jffs2_is_writebuffered(c))
return 0;
diff --git a/fs/namei.c b/fs/namei.c
index 360a86ca1f02..8510ff53f12e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,9 +125,9 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
-static inline void initname(struct filename *name)
+static inline void initname(struct filename *name, const char __user *uptr)
{
- name->uptr = NULL;
+ name->uptr = uptr;
name->aname = NULL;
atomic_set(&name->refcnt, 1);
}
@@ -210,7 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
- initname(result);
+ initname(result, filename);
audit_getname(result);
return result;
}
@@ -268,7 +268,7 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- initname(result);
+ initname(result, NULL);
audit_getname(result);
return result;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 16292ff760c9..d9ca80dcc544 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1830,6 +1830,8 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
+DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
+
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -2383,7 +2385,7 @@ void dissolve_on_fput(struct vfsmount *mnt)
return;
}
- scoped_guard(rwsem_write, &namespace_sem) {
+ scoped_guard(namespace_lock, &namespace_sem) {
ns = m->mnt_ns;
if (!must_dissolve(ns))
return;
@@ -2478,7 +2480,8 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- scoped_guard(rwsem_read, &namespace_sem)
+ guard(rwsem_read)(&namespace_sem);
+
if (IS_MNT_UNBINDABLE(old_mnt))
return ERR_PTR(-EINVAL);
@@ -5188,8 +5191,8 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
mnt_idmap_put(kattr->mnt_idmap);
}
-static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
- struct mount_kattr *kattr)
+static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
int ret;
struct mount_attr attr;
@@ -5212,9 +5215,13 @@ static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
if (attr.attr_set == 0 &&
attr.attr_clr == 0 &&
attr.propagation == 0)
- return 0;
+ return 0; /* Tell caller to not bother. */
+
+ ret = build_mount_kattr(&attr, usize, kattr);
+ if (ret < 0)
+ return ret;
- return build_mount_kattr(&attr, usize, kattr);
+ return 1;
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -5246,8 +5253,8 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
- err = copy_mount_setattr(uattr, usize, &kattr);
- if (err)
+ err = wants_mount_setattr(uattr, usize, &kattr);
+ if (err <= 0)
return err;
err = user_path_at(dfd, path, kattr.lookup_flags, &target);
@@ -5281,15 +5288,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
if (flags & AT_RECURSIVE)
kattr.kflags |= MOUNT_KATTR_RECURSE;
- ret = copy_mount_setattr(uattr, usize, &kattr);
- if (ret)
+ ret = wants_mount_setattr(uattr, usize, &kattr);
+ if (ret < 0)
return ret;
- ret = do_mount_setattr(&file->f_path, &kattr);
- if (ret)
- return ret;
+ if (ret) {
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
- finish_mount_kattr(&kattr);
+ finish_mount_kattr(&kattr);
+ }
}
fd = get_unused_fd_flags(flags & O_CLOEXEC);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 4e3e62040831..70ecc8f5f210 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -127,11 +127,13 @@ static int __init netfs_init(void)
if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
goto error_subreqpool;
+#ifdef CONFIG_PROC_FS
if (!proc_mkdir("fs/netfs", NULL))
goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
goto error_procfile;
+#endif
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
@@ -144,9 +146,11 @@ static int __init netfs_init(void)
return 0;
error_fscache:
+#ifdef CONFIG_PROC_FS
error_procfile:
remove_proc_subtree("fs/netfs", NULL);
error_proc:
+#endif
mempool_exit(&netfs_subrequest_pool);
error_subreqpool:
kmem_cache_destroy(netfs_subrequest_slab);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3a202e51b360..83970d97840b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2424,7 +2424,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
* the area protected by sc_state_lock.
*/
if (thread_is_alive)
- del_timer_sync(&sci->sc_timer);
+ timer_delete_sync(&sci->sc_timer);
}
/**
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f46b22561d6..fce9beb214f0 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
/* we shouldn't flush as we're in the thread, the
* races with pending sc work structs are harmless */
- del_timer_sync(&sc->sc_idle_timeout);
+ timer_delete_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6f2f8f4cfbbc..aef942a758ce 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -541,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_ensure_verity_loaded(struct path *path);
-int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path,
- u8 *digest_buf, int *buf_length);
int ovl_validate_verity(struct ovl_fs *ofs,
struct path *metapath,
struct path *datapath);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b63474d1b064..e19940d649ca 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1138,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
return ERR_PTR(-EINVAL);
}
+ if (ctx->nr == ctx->nr_data) {
+ pr_err("at least one non-data lowerdir is required\n");
+ return ERR_PTR(-EINVAL);
+ }
+
err = -EINVAL;
for (i = 0; i < ctx->nr; i++) {
l = &ctx->lower[i];
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 557cf9d40177..f8b9c9c73997 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -563,7 +563,7 @@ void pstore_unregister(struct pstore_info *psi)
pstore_unregister_kmsg();
/* Stop timer and make sure all work has finished. */
- del_timer_sync(&pstore_timer);
+ timer_delete_sync(&pstore_timer);
flush_work(&pstore_work);
/* Remove all backend records from filesystem tree. */
diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h
index 651759192280..5e8d163cb5f8 100644
--- a/fs/smb/client/cifs_fs_sb.h
+++ b/fs/smb/client/cifs_fs_sb.h
@@ -49,6 +49,7 @@
struct cifs_sb_info {
struct rb_root tlink_tree;
+ struct list_head tcon_sb_link;
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index e69968e88fe7..35892df7335c 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -704,18 +704,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
cifs_free_hash(&server->secmech.md5);
cifs_free_hash(&server->secmech.sha512);
- if (!SERVER_IS_CHAN(server)) {
- if (server->secmech.enc) {
- crypto_free_aead(server->secmech.enc);
- server->secmech.enc = NULL;
- }
-
- if (server->secmech.dec) {
- crypto_free_aead(server->secmech.dec);
- server->secmech.dec = NULL;
- }
- } else {
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
server->secmech.enc = NULL;
+ }
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
server->secmech.dec = NULL;
}
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 8dea0cf3a8de..ca435a3841b8 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid,
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern void cifs_setsize(struct inode *inode, loff_t offset);
-extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
struct smb3_fs_context;
extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
@@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 53
-#define CIFS_VERSION "2.53"
+#define SMB3_PRODUCT_BUILD 54
+#define CIFS_VERSION "2.54"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 6ae170a2a042..3b32116b0b49 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -625,10 +625,8 @@ struct smb_version_operations {
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
- int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
+ struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov,
+ u32 *plen);
int (*create_reparse_symlink)(const unsigned int xid,
struct inode *inode,
struct dentry *dentry,
@@ -714,6 +712,8 @@ struct TCP_Server_Info {
spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
+ int rfc1001_sessinit; /* whether to estasblish netbios session */
+ bool with_rfc1001; /* if netbios session is used */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
@@ -1321,7 +1321,8 @@ struct cifs_tcon {
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fids *cfids;
- /* BB add field for back pointer to sb struct(s)? */
+ struct list_head cifs_sb_list;
+ spinlock_t sb_list_lock;
#ifdef CONFIG_CIFS_DFS_UPCALL
struct delayed_work dfs_cache_work;
struct list_head dfs_ses_list;
@@ -1718,6 +1719,7 @@ struct mid_q_entry {
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
+ int mid_rc; /* rc for MID_RC */
unsigned int mid_flags;
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
@@ -1880,6 +1882,7 @@ static inline bool is_replayable_error(int error)
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
+#define MID_RC 0x80 /* mid_rc contains custom rc */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 48d0d6f439cf..18d67ab113f0 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -2256,6 +2256,8 @@ typedef struct {
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 29dcb88392e5..60cb264a01e5 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1041,15 +1041,31 @@ static __u16 convert_disposition(int disposition)
static int
access_flags_to_smbopen_mode(const int access_flags)
{
- int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
-
- if (masked_flags == GENERIC_READ)
- return SMBOPEN_READ;
- else if (masked_flags == GENERIC_WRITE)
+ /*
+ * SYSTEM_SECURITY grants both read and write access to SACL, treat is as read/write.
+ * MAXIMUM_ALLOWED grants as many access as possible, so treat it as read/write too.
+ * SYNCHRONIZE as is does not grant any specific access, so do not check its mask.
+ * If only SYNCHRONIZE bit is specified then fallback to read access.
+ */
+ bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA |
+ FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE |
+ WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY |
+ MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL);
+ bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE |
+ FILE_READ_ATTRIBUTES | READ_CONTROL |
+ SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE | GENERIC_READ);
+ bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE);
+
+ if (with_write_flags && with_read_flags)
+ return SMBOPEN_READWRITE;
+ else if (with_write_flags)
return SMBOPEN_WRITE;
-
- /* just go for read/write */
- return SMBOPEN_READWRITE;
+ else if (with_execute_flags)
+ return SMBOPEN_EXECUTE;
+ else
+ return SMBOPEN_READ;
}
int
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d7bad2c3af37..4a0b2d220fe8 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -371,7 +371,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
*
*/
static int __cifs_reconnect(struct TCP_Server_Info *server,
- bool mark_smb_session)
+ bool mark_smb_session, bool once)
{
int rc = 0;
@@ -399,6 +399,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (rc) {
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
+ /* If was asked to reconnect only once, do not try it more times */
+ if (once)
+ break;
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -564,19 +567,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
return rc;
}
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
if (!server->leaf_fullpath)
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
return reconnect_dfs_server(server);
}
#else
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
}
#endif
+int
+cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ return _cifs_reconnect(server, mark_smb_session, false);
+}
+
+static int
+cifs_reconnect_once(struct TCP_Server_Info *server)
+{
+ return _cifs_reconnect(server, true, true);
+}
+
static void
cifs_echo_request(struct work_struct *work)
{
@@ -803,26 +820,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
/* Regular SMB response */
return true;
case RFC1002_SESSION_KEEP_ALIVE:
+ /*
+ * RFC 1002 session keep alive can sent by the server only when
+ * we established a RFC 1002 session. But Samba servers send
+ * RFC 1002 session keep alive also over port 445 on which
+ * RFC 1002 session is not established.
+ */
cifs_dbg(FYI, "RFC 1002 session keep alive\n");
break;
case RFC1002_POSITIVE_SESSION_RESPONSE:
- cifs_dbg(FYI, "RFC 1002 positive session response\n");
+ /*
+ * RFC 1002 positive session response cannot be returned
+ * for SMB request. RFC 1002 session response is handled
+ * exclusively in ip_rfc1001_connect() function.
+ */
+ cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n");
+ cifs_reconnect(server, true);
break;
case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
- * SMB negprot response.
+ * SMB negprot response, when we have not established
+ * RFC 1002 session (which means ip_rfc1001_connect()
+ * was skipped). Note that same still happens with
+ * Windows Server 2022 when connecting via port 139.
+ * So for this case when mount option -o nonbsessinit
+ * was not specified, try to reconnect with establishing
+ * RFC 1002 session. If new socket establishment with
+ * RFC 1002 session was successful then return to the
+ * mid's caller -EAGAIN, so it can retry the request.
*/
- cifs_dbg(FYI, "RFC 1002 negative session response\n");
- /* give server a second to clean up */
- msleep(1000);
- /*
- * Always try 445 first on reconnect since we get NACK
- * on some if we ever connected to port 139 (the NACK
- * is since we do not begin with RFC1001 session
- * initialize frame).
- */
- cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+ if (!cifs_rdma_enabled(server) &&
+ server->tcpStatus == CifsInNegotiate &&
+ !server->with_rfc1001 &&
+ server->rfc1001_sessinit != 0) {
+ int rc, mid_rc;
+ struct mid_q_entry *mid, *nmid;
+ LIST_HEAD(dispose_list);
+
+ cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n");
+
+ /*
+ * Before reconnect, delete all pending mids for this
+ * server, so reconnect would not signal connection
+ * aborted error to mid's callbacks. Note that for this
+ * server there should be exactly one pending mid
+ * corresponding to SMB1/SMB2 Negotiate packet.
+ */
+ spin_lock(&server->mid_lock);
+ list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
+ kref_get(&mid->refcount);
+ list_move(&mid->qhead, &dispose_list);
+ mid->mid_flags |= MID_DELETED;
+ }
+ spin_unlock(&server->mid_lock);
+
+ /* Now try to reconnect once with NetBIOS session. */
+ server->with_rfc1001 = true;
+ rc = cifs_reconnect_once(server);
+
+ /*
+ * If reconnect was successful then indicate -EAGAIN
+ * to mid's caller. If reconnect failed with -EAGAIN
+ * then mask it as -EHOSTDOWN, so mid's caller would
+ * know that it failed.
+ */
+ if (rc == 0)
+ mid_rc = -EAGAIN;
+ else if (rc == -EAGAIN)
+ mid_rc = -EHOSTDOWN;
+ else
+ mid_rc = rc;
+
+ /*
+ * After reconnect (either successful or unsuccessful)
+ * deliver reconnect status to mid's caller via mid's
+ * callback. Use MID_RC state which indicates that the
+ * return code should be read from mid_rc member.
+ */
+ list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) {
+ list_del_init(&mid->qhead);
+ mid->mid_rc = mid_rc;
+ mid->mid_state = MID_RC;
+ mid->callback(mid);
+ release_mid(mid);
+ }
+
+ /*
+ * If reconnect failed then wait two seconds. In most
+ * cases we were been called from the mount context and
+ * delivered failure to mid's callback will stop this
+ * receiver task thread and fails the mount process.
+ * So wait two seconds to prevent another reconnect
+ * in this task thread, which would be useless as the
+ * mount context will fail at all.
+ */
+ if (rc != 0)
+ msleep(2000);
+ } else {
+ cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n");
+ cifs_reconnect(server, true);
+ }
+ break;
+ case RFC1002_RETARGET_SESSION_RESPONSE:
+ cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n");
cifs_reconnect(server, true);
break;
default:
@@ -1701,6 +1802,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+ tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit;
+ tcp_ses->with_rfc1001 = false;
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
@@ -1731,12 +1834,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
*/
tcp_ses->tcpStatus = CifsNew;
++tcp_ses->srv_count;
+ tcp_ses->echo_interval = ctx->echo_interval * HZ;
- if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
- ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX)
- tcp_ses->echo_interval = ctx->echo_interval * HZ;
- else
- tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
if (tcp_ses->rdma) {
#ifndef CONFIG_CIFS_SMB_DIRECT
cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
@@ -2457,6 +2556,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
return 0;
if (tcon->nodelete != ctx->nodelete)
return 0;
+ if (tcon->posix_extensions != ctx->linux_ext)
+ return 0;
return 1;
}
@@ -3221,6 +3322,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
return -EIO;
}
+ server->with_rfc1001 = true;
return 0;
}
@@ -3332,7 +3434,16 @@ generic_ip_connect(struct TCP_Server_Info *server)
return rc;
}
trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
- if (sport == htons(RFC1001_PORT))
+
+ /*
+ * Establish RFC1001 NetBIOS session when it was explicitly requested
+ * by mount option -o nbsessinit, or when connecting to default RFC1001
+ * server port (139) and it was not explicitly disabled by mount option
+ * -o nonbsessinit.
+ */
+ if (server->with_rfc1001 ||
+ server->rfc1001_sessinit == 1 ||
+ (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT)))
rc = ip_rfc1001_connect(server);
return rc;
@@ -3481,6 +3592,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
struct smb3_fs_context *ctx = cifs_sb->ctx;
INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+ INIT_LIST_HEAD(&cifs_sb->tcon_sb_link);
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
@@ -3713,6 +3825,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
+ spin_lock(&tcon->sb_list_lock);
+ list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list);
+ spin_unlock(&tcon->sb_list_lock);
+
queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
return 0;
@@ -4054,9 +4170,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
struct rb_root *root = &cifs_sb->tlink_tree;
struct rb_node *node;
struct tcon_link *tlink;
+ struct cifs_tcon *tcon = NULL;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+ if (cifs_sb->master_tlink) {
+ tcon = cifs_sb->master_tlink->tl_tcon;
+ if (tcon) {
+ spin_lock(&tcon->sb_list_lock);
+ list_del_init(&cifs_sb->tcon_sb_link);
+ spin_unlock(&tcon->sb_list_lock);
+ }
+ }
+
spin_lock(&cifs_sb->tlink_tree_lock);
while ((node = rb_first(root))) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
@@ -4078,11 +4204,13 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
+ bool in_retry = false;
int rc = 0;
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
+retry:
/* only send once per connect */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsGood &&
@@ -4102,6 +4230,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
+ if (rc == -EAGAIN) {
+ /* Allow one retry attempt */
+ if (!in_retry) {
+ in_retry = true;
+ goto retry;
+ }
+ rc = -EHOSTDOWN;
+ }
if (rc == 0) {
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index bdb762d398af..2980941b9667 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -135,6 +135,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("witness", Opt_witness),
fsparam_flag_no("nativesocket", Opt_nativesocket),
fsparam_flag_no("unicode", Opt_unicode),
+ fsparam_flag_no("nbsessinit", Opt_nbsessinit),
/* Mount options which take uid or gid */
fsparam_uid("backupuid", Opt_backupuid),
@@ -968,6 +969,10 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change unicode during remount\n");
return -EINVAL;
}
+ if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) {
+ cifs_errorf(fc, "can not change nbsessinit during remount\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -1333,6 +1338,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_rsize:
ctx->rsize = result.uint_32;
ctx->got_rsize = true;
+ ctx->vol_rsize = ctx->rsize;
break;
case Opt_wsize:
ctx->wsize = result.uint_32;
@@ -1348,6 +1354,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->wsize, PAGE_SIZE);
}
}
+ ctx->vol_wsize = ctx->wsize;
break;
case Opt_acregmax:
if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
@@ -1383,6 +1390,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->closetimeo = HZ * result.uint_32;
break;
case Opt_echo_interval:
+ if (result.uint_32 < SMB_ECHO_INTERVAL_MIN ||
+ result.uint_32 > SMB_ECHO_INTERVAL_MAX) {
+ cifs_errorf(fc, "echo interval is out of bounds\n");
+ goto cifs_parse_mount_err;
+ }
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
@@ -1602,6 +1614,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (i == RFC1001_NAME_LEN && param->string[i] != 0)
pr_warn("server netbiosname longer than 15 truncated\n");
break;
+ case Opt_nbsessinit:
+ ctx->rfc1001_sessinit = !result.negated;
+ cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit);
+ break;
case Opt_ver:
/* version of mount userspace tools, not dialect */
/* If interface changes in mount.cifs bump to new ver */
@@ -1889,13 +1905,16 @@ int smb3_init_fs_context(struct fs_context *fc)
memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
ctx->source_rfc1001_name[i] = toupper(nodename[i]);
-
ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
+
/*
* null target name indicates to use *SMBSERVR default called name
* if we end up sending RFC1001 session initialize
*/
ctx->target_rfc1001_name[0] = 0;
+
+ ctx->rfc1001_sessinit = -1; /* autodetect based on port number */
+
ctx->cred_uid = current_uid();
ctx->linux_uid = current_uid();
ctx->linux_gid = current_gid();
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 42c6b66c2c1a..d1d29249bcdb 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -174,6 +174,7 @@ enum cifs_param {
Opt_iocharset,
Opt_netbiosname,
Opt_servern,
+ Opt_nbsessinit,
Opt_ver,
Opt_vers,
Opt_sec,
@@ -216,6 +217,7 @@ struct smb3_fs_context {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+ int rfc1001_sessinit;
kuid_t cred_uid;
kuid_t linux_uid;
kgid_t linux_gid;
@@ -280,6 +282,9 @@ struct smb3_fs_context {
bool use_client_guid:1;
/* reuse existing guid for multichannel */
u8 client_guid[SMB2_CLIENT_GUID_SIZE];
+ /* User-specified original r/wsize value */
+ unsigned int vol_rsize;
+ unsigned int vol_wsize;
unsigned int bsize;
unsigned int rasize;
unsigned int rsize;
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 3bb21aa58474..75be4b46bc6f 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -1203,18 +1203,17 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
goto out;
}
break;
- case IO_REPARSE_TAG_MOUNT_POINT:
- cifs_create_junction_fattr(fattr, sb);
- rc = 0;
- goto out;
default:
/* Check for cached reparse point data */
if (data->symlink_target || data->reparse.buf) {
rc = 0;
- } else if (iov && server->ops->parse_reparse_point) {
- rc = server->ops->parse_reparse_point(cifs_sb,
- full_path,
- iov, data);
+ } else if (iov && server->ops->get_reparse_point_buffer) {
+ struct reparse_data_buffer *reparse_buf;
+ u32 reparse_len;
+
+ reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len);
+ rc = parse_reparse_point(reparse_buf, reparse_len,
+ cifs_sb, full_path, data);
/*
* If the reparse point was not handled but it is the
* name surrogate which points to directory, then treat
@@ -1228,6 +1227,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
cifs_create_junction_fattr(fattr, sb);
goto out;
}
+ /*
+ * If the reparse point is unsupported by the Linux SMB
+ * client then let it process by the SMB server. So mask
+ * the -EOPNOTSUPP error code. This will allow Linux SMB
+ * client to send SMB OPEN request to server. If server
+ * does not support this reparse point too then server
+ * will return error during open the path.
+ */
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
}
if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
@@ -2901,23 +2910,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
return -EOPNOTSUPP;
}
-int cifs_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE - 1);
- struct page *page;
- int rc = 0;
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
-
- zero_user_segment(page, offset, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- return rc;
-}
-
void cifs_setsize(struct inode *inode, loff_t offset)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
@@ -3012,8 +3004,6 @@ set_size_out:
*/
attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
-
- cifs_truncate_page(inode->i_mapping, inode->i_size);
}
return rc;
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index a88253668286..769752ad2c5c 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
+ struct cifs_open_info_data query_data;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data);
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
rc = -ENOENT;
/* it's not a symlink */
goto out;
@@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index b328dc5c7988..7b6ed9b23e71 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
+ INIT_LIST_HEAD(&ret_buf->cifs_sb_list);
spin_lock_init(&ret_buf->open_file_lock);
spin_lock_init(&ret_buf->stat_lock);
+ spin_lock_init(&ret_buf->sb_list_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 2b9e9885dc42..bb25e77c5540 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -542,12 +542,12 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer **buf,
kfree(symname_utf16);
return -ENOMEM;
}
- /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */
- symlink_buf->Flags = cpu_to_le32(0x02000000);
- /* PathBuffer is in UTF-8 but without trailing null-term byte */
+ /* Version field must be set to 2 (MS-FSCC 2.1.2.7) */
+ symlink_buf->Version = cpu_to_le32(2);
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2,
UTF16_LITTLE_ENDIAN,
- symlink_buf->PathBuffer,
+ symlink_buf->Target,
symname_utf8_maxlen);
*buf = (struct reparse_data_buffer *)symlink_buf;
buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len;
@@ -1016,29 +1016,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf
struct cifs_open_info_data *data)
{
int len = le16_to_cpu(buf->ReparseDataLength);
+ int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version);
int symname_utf8_len;
__le16 *symname_utf16;
int symname_utf16_len;
- if (len <= sizeof(buf->Flags)) {
+ if (len <= data_offset) {
cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n");
return -EIO;
}
- /* PathBuffer is in UTF-8 but without trailing null-term byte */
- symname_utf8_len = len - sizeof(buf->Flags);
+ /* MS-FSCC 2.1.2.7 defines layout of the Target field only for Version 2. */
+ if (le32_to_cpu(buf->Version) != 2) {
+ cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version));
+ return -EIO;
+ }
+
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
+ symname_utf8_len = len - data_offset;
/*
* Check that buffer does not contain null byte
* because Linux cannot process symlink with null byte.
*/
- if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) {
+ if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) {
cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n");
return -EIO;
}
symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL);
if (!symname_utf16)
return -ENOMEM;
- symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len,
+ symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len,
UTF16_LITTLE_ENDIAN,
(wchar_t *) symname_utf16, symname_utf8_len * 2);
if (symname_utf16_len < 0) {
@@ -1062,8 +1069,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
const char *full_path,
struct cifs_open_info_data *data)
{
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
@@ -1090,24 +1095,17 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
}
return 0;
default:
- cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n",
- le32_to_cpu(buf->ReparseTag));
return -EOPNOTSUPP;
}
}
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
- u32 plen = le32_to_cpu(io->OutputCount);
-
- buf = (struct reparse_data_buffer *)((u8 *)io +
- le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
+ *plen = le32_to_cpu(io->OutputCount);
+ return (struct reparse_data_buffer *)((u8 *)io +
+ le32_to_cpu(io->OutputOffset));
}
static bool wsl_to_fattr(struct cifs_open_info_data *data,
@@ -1233,16 +1231,6 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
bool ok;
switch (tag) {
- case IO_REPARSE_TAG_INTERNAL:
- if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
- return false;
- fallthrough;
- case IO_REPARSE_TAG_DFS:
- case IO_REPARSE_TAG_DFSR:
- case IO_REPARSE_TAG_MOUNT_POINT:
- /* See cifs_create_junction_fattr() */
- fattr->cf_mode = S_IFDIR | 0711;
- break;
case IO_REPARSE_TAG_LX_SYMLINK:
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_AF_UNIX:
@@ -1262,7 +1250,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
fattr->cf_mode |= S_IFLNK;
break;
default:
- return false;
+ if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
+ return false;
+ if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) &&
+ tag != IO_REPARSE_TAG_INTERNAL)
+ return false;
+ /* See cifs_create_junction_fattr() */
+ fattr->cf_mode = S_IFDIR | 0711;
+ break;
}
fattr->cf_dtype = S_DT(fattr->cf_mode);
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index c0be5ab45a78..08de853b36a8 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -135,9 +135,6 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len);
#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index f2ca5963cd9d..b3fa9ee26912 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -680,6 +680,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
*pbcc_area = bcc_ptr;
}
+static void
+ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+
+ strcpy(bcc_ptr, "Linux version ");
+ bcc_ptr += strlen("Linux version ");
+ strcpy(bcc_ptr, init_utsname()->release);
+ bcc_ptr += strlen(init_utsname()->release) + 1;
+
+ strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+ bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -704,6 +720,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*pbcc_area = bcc_ptr;
}
+static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+ int len;
+
+ /* copy domain */
+ if (ses->domainName != NULL) {
+ len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ if (WARN_ON_ONCE(len < 0))
+ len = CIFS_MAX_DOMAINNAME_LEN - 1;
+ bcc_ptr += len;
+ } /* else we send a null domain name so server will default to its own domain */
+ *bcc_ptr = 0;
+ bcc_ptr++;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -749,25 +784,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
- /* copy domain */
- if (ses->domainName != NULL) {
- len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
- if (WARN_ON_ONCE(len < 0))
- len = CIFS_MAX_DOMAINNAME_LEN - 1;
- bcc_ptr += len;
- } /* else we send a null domain name so server will default to its own domain */
- *bcc_ptr = 0;
- bcc_ptr++;
-
/* BB check for overflow here */
- strcpy(bcc_ptr, "Linux version ");
- bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, init_utsname()->release);
- bcc_ptr += strlen(init_utsname()->release) + 1;
-
- strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
- bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+ ascii_domain_string(&bcc_ptr, ses, nls_cp);
+ ascii_oslm_strings(&bcc_ptr, nls_cp);
*pbcc_area = bcc_ptr;
}
@@ -1570,7 +1590,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
sess_data->iov[1].iov_len = msg->secblob_len;
pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
- if (ses->capabilities & CAP_UNICODE) {
+ if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {
/* unicode strings must be word aligned */
if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
@@ -1579,8 +1599,8 @@ sess_auth_kerberos(struct sess_data *sess_data)
unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
} else {
- /* BB: is this right? */
- ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
}
sess_data->iov[2].iov_len = (long) bcc_ptr -
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 8701484805cd..0adeec652dc1 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -14,6 +14,8 @@
#include "cifspdu.h"
#include "cifs_unicode.h"
#include "fs_context.h"
+#include "nterr.h"
+#include "smberr.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -426,13 +428,6 @@ cifs_negotiate(const unsigned int xid,
{
int rc;
rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- set_credits(server, 1);
- rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
return rc;
}
@@ -444,8 +439,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- if (ctx->wsize)
- wsize = ctx->wsize;
+ if (ctx->got_wsize)
+ wsize = ctx->vol_wsize;
else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
wsize = CIFS_DEFAULT_IOSIZE;
else
@@ -497,7 +492,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
else
defsize = server->maxBuf - sizeof(READ_RSP);
- rsize = ctx->rsize ? ctx->rsize : defsize;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : defsize;
/*
* no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
@@ -573,6 +568,42 @@ static int cifs_query_path_info(const unsigned int xid,
data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE;
}
+#ifdef CONFIG_CIFS_XATTR
+ /*
+ * For WSL CHR and BLK reparse points it is required to fetch
+ * EA $LXDEV which contains major and minor device numbers.
+ */
+ if (!rc && data->reparse_point) {
+ struct smb2_file_full_ea_info *ea;
+
+ ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+ rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV,
+ &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1],
+ SMB2_WSL_XATTR_DEV_SIZE, cifs_sb);
+ if (rc == SMB2_WSL_XATTR_DEV_SIZE) {
+ ea->next_entry_offset = cpu_to_le32(0);
+ ea->flags = 0;
+ ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN;
+ ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE);
+ memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1);
+ data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 +
+ SMB2_WSL_XATTR_DEV_SIZE;
+ rc = 0;
+ } else if (rc >= 0) {
+ /* It is an error if EA $LXDEV has wrong size. */
+ rc = -EINVAL;
+ } else {
+ /*
+ * In all other cases ignore error if fetching
+ * of EA $LXDEV failed. It is needed only for
+ * WSL CHR and BLK reparse points and wsl_to_fattr()
+ * handle the case when EA is missing.
+ */
+ rc = 0;
+ }
+ }
+#endif
+
return rc;
}
@@ -975,18 +1006,13 @@ static int cifs_query_symlink(const unsigned int xid,
return rc;
}
-static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base;
- u32 plen = le16_to_cpu(io->ByteCount);
-
- buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
- le32_to_cpu(io->DataOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, data);
+ *plen = le16_to_cpu(io->ByteCount);
+ return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
+ le32_to_cpu(io->DataOffset));
}
static bool
@@ -1069,6 +1095,47 @@ cifs_make_node(unsigned int xid, struct inode *inode,
full_path, mode, dev);
}
+static bool
+cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
+{
+ struct smb_hdr *shdr = (struct smb_hdr *)buf;
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ if (shdr->Flags2 & SMBFLG2_ERR_STATUS) {
+ if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED))
+ return false;
+ } else {
+ if (shdr->Status.DosError.ErrorClass != ERRSRV ||
+ shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid))
+ return false;
+ }
+
+ /* If server is a channel, select the primary channel */
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->tid == shdr->Tid) {
+ spin_lock(&tcon->tc_lock);
+ tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ pr_warn_once("Server share %s deleted.\n",
+ tcon->tree_name);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1121,7 +1188,7 @@ struct smb_version_operations smb1_operations = {
.rename = CIFSSMBRename,
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
- .parse_reparse_point = cifs_parse_reparse_point,
+ .get_reparse_point_buffer = cifs_get_reparse_point_buffer,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
@@ -1153,6 +1220,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
.make_node = cifs_make_node,
+ .is_network_name_deleted = cifs_is_network_name_deleted,
};
struct smb_version_values smb1_values = {
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index d609a20fb98a..a7f629238830 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -152,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
+ bool retry_without_read_attributes = false;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL)
return -ENOMEM;
- oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ /*
+ * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED
+ * contains also FILE_READ_ATTRIBUTES access right. So do not append
+ * FILE_READ_ATTRIBUTES when not needed and prevent calling code path
+ * for retry_without_read_attributes.
+ */
+ if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) &&
+ !(oparms->desired_access & GENERIC_READ) &&
+ !(oparms->desired_access & GENERIC_EXECUTE) &&
+ !(oparms->desired_access & GENERIC_ALL) &&
+ !(oparms->desired_access & MAXIMUM_ALLOWED)) {
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ retry_without_read_attributes = true;
+ }
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
&err_buftype);
+ if (rc == -EACCES && retry_without_read_attributes) {
+ oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ }
if (rc && data) {
struct smb2_hdr *hdr = err_iov.iov_base;
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 2466e6155136..224495322a05 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -38,6 +38,7 @@ enum smb2_compound_ops {
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
SMB2_OP_QUERY_WSL_EA,
+ SMB2_OP_OPEN_QUERY,
};
/* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index e9fd3e204a6f..57d9bfbadd97 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
{
+ struct smb2_create_rsp *create_rsp = NULL;
struct smb2_query_info_rsp *qi_rsp = NULL;
struct smb2_compound_vars *vars = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -265,7 +266,13 @@ replay_again:
num_rqst++;
rc = 0;
- for (i = 0; i < num_cmds; i++) {
+ i = 0;
+
+ /* Skip the leading explicit OPEN operation */
+ if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY)
+ i++;
+
+ for (; i < num_cmds; i++) {
/* Operation */
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
@@ -640,6 +647,27 @@ finished:
}
tmp_rc = rc;
+
+ if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) {
+ create_rsp = rsp_iov[0].iov_base;
+ idata = in_iov[0].iov_base;
+ idata->fi.CreationTime = create_rsp->CreationTime;
+ idata->fi.LastAccessTime = create_rsp->LastAccessTime;
+ idata->fi.LastWriteTime = create_rsp->LastWriteTime;
+ idata->fi.ChangeTime = create_rsp->ChangeTime;
+ idata->fi.Attributes = create_rsp->FileAttributes;
+ idata->fi.AllocationSize = create_rsp->AllocationSize;
+ idata->fi.EndOfFile = create_rsp->EndofFile;
+ if (le32_to_cpu(idata->fi.NumberOfLinks) == 0)
+ idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */
+ idata->fi.DeletePending = 0;
+ idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY);
+
+ /* smb2_parse_contexts() fills idata->fi.IndexNumber */
+ rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,
+ oparms->fid->lease_key, &oplock, &idata->fi, NULL);
+ }
+
for (i = 0; i < num_cmds; i++) {
char *buf = rsp_iov[i + i].iov_base;
@@ -978,6 +1006,43 @@ int smb2_query_path_info(const unsigned int xid,
case 0:
rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
break;
+ case -EACCES:
+ /*
+ * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with
+ * STATUS_ACCESS_DENIED then it means that caller does not have permission to
+ * open the path with FILE_READ_ATTRIBUTES access and therefore cannot issue
+ * SMB2_OP_QUERY_INFO command.
+ *
+ * There is an alternative way how to query limited information about path but still
+ * suitable for stat() syscall. SMB2 OPEN/CREATE operation returns in its successful
+ * response subset of query information.
+ *
+ * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED
+ * access which will grant the maximum possible access to the file and the response
+ * will contain required query information for stat() syscall.
+ */
+
+ if (tcon->posix_extensions)
+ break;
+
+ num_cmds = 1;
+ cmds[0] = SMB2_OP_OPEN_QUERY;
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED,
+ FILE_OPEN, create_options, ACL_NO_MODE);
+ free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov));
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, out_iov, out_buftype, NULL);
+
+ hdr = out_iov[0].iov_base;
+ if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ if (!rc)
+ rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
+ break;
case -EOPNOTSUPP:
/*
* BB TODO: When support for special files added to Samba
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index a700e5921961..2fe8eeb98535 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid,
server->CurrentMid = 0;
spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
- /* BB we probably don't need to retry with modern servers */
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
return rc;
}
+static inline unsigned int
+prevent_zero_iosize(unsigned int size, const char *type)
+{
+ if (size == 0) {
+ cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n",
+ type, CIFS_MIN_DEFAULT_IOSIZE);
+ return CIFS_MIN_DEFAULT_IOSIZE;
+ }
+ return size;
+}
+
static unsigned int
smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
@@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
static unsigned int
@@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
@@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
/*
@@ -3526,8 +3534,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc == 0) {
netfs_resize_file(&cifsi->netfs, new_eof, true);
cifs_setsize(inode, new_eof);
- cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, new_eof);
}
goto out;
}
@@ -4549,9 +4555,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
return rc;
}
} else {
- if (unlikely(!server->secmech.dec))
- return -EIO;
-
+ rc = smb3_crypto_aead_allocate(server);
+ if (unlikely(rc))
+ return rc;
tfm = server->secmech.dec;
}
@@ -5297,7 +5303,7 @@ struct smb_version_operations smb20_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5400,7 +5406,7 @@ struct smb_version_operations smb21_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5507,7 +5513,7 @@ struct smb_version_operations smb30_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
@@ -5623,7 +5629,7 @@ struct smb_version_operations smb311_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
.create_reparse_symlink = smb2_create_reparse_symlink,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4f69a1825e42..c4d52bebd37d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -43,6 +43,7 @@
#endif
#include "cached_dir.h"
#include "compress.h"
+#include "fs_context.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -1251,15 +1252,8 @@ SMB2_negotiate(const unsigned int xid,
cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
}
- if (server->cipher_type && !rc) {
- if (!SERVER_IS_CHAN(server)) {
- rc = smb3_crypto_aead_allocate(server);
- } else {
- /* For channels, just reuse the primary server crypto secmech. */
- server->secmech.enc = server->primary_server->secmech.enc;
- server->secmech.dec = server->primary_server->secmech.dec;
- }
- }
+ if (server->cipher_type && !rc)
+ rc = smb3_crypto_aead_allocate(server);
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -4089,6 +4083,24 @@ smb2_echo_callback(struct mid_q_entry *mid)
add_credits(server, &credits, CIFS_ECHO_OP);
}
+static void cifs_renegotiate_iosize(struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_sb_info *cifs_sb;
+
+ if (server == NULL || tcon == NULL)
+ return;
+
+ spin_lock(&tcon->sb_list_lock);
+ list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) {
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
+ cifs_sb->ctx->wsize =
+ server->ops->negotiate_wsize(tcon, cifs_sb->ctx);
+ }
+ spin_unlock(&tcon->sb_list_lock);
+}
+
void smb2_reconnect_server(struct work_struct *work)
{
struct TCP_Server_Info *server = container_of(work,
@@ -4174,9 +4186,10 @@ void smb2_reconnect_server(struct work_struct *work)
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
- if (!rc)
+ if (!rc) {
+ cifs_renegotiate_iosize(server, tcon);
cifs_reopen_persistent_handles(tcon);
- else
+ } else
resched = true;
list_del_init(&tcon->rlist);
if (tcon->ipc)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 03434dbe9374..266af17aa7d9 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -894,6 +894,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
case MID_SHUTDOWN:
rc = -EHOSTDOWN;
break;
+ case MID_RC:
+ rc = mid->mid_rc;
+ break;
default:
if (!(mid->mid_flags & MID_DELETED)) {
list_del_init(&mid->qhead);
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 7d49f38f01f3..b88fa04f5792 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -31,6 +31,8 @@
* secure, replaced by SMB2 (then even more highly secure SMB3) many years ago
*/
#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */
+#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */
+#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */
#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */
#define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */
#define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */
@@ -38,6 +40,7 @@
/* BB need to add server (Samba e.g) support for security and trusted prefix */
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
+ XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER,
XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL };
static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
@@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
struct smb_ntsd *pacl;
@@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP);
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ aclflags = CIFS_ACL_SACL;
+ break;
case XATTR_CIFS_ACL:
default:
aclflags = CIFS_ACL_DACL;
@@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
/*
@@ -327,6 +341,12 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
case XATTR_CIFS_NTSD:
extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ extra_info = SACL_SECINFO;
+ break;
case XATTR_CIFS_ACL:
default:
extra_info = DACL_SECINFO;
@@ -448,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = {
.set = cifs_xattr_set,
};
+static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_SACL,
+ .flags = XATTR_CIFS_NTSD_SACL,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler smb3_ntsd_owner_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_OWNER,
+ .flags = XATTR_CIFS_NTSD_OWNER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = {
.name = CIFS_XATTR_CIFS_NTSD,
.flags = XATTR_CIFS_NTSD,
@@ -493,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+ &smb3_ntsd_sacl_xattr_handler,
+ &smb3_ntsd_owner_xattr_handler,
&cifs_cifs_ntsd_xattr_handler,
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c7a0efda4403..f79a5165a7cc 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -95,6 +95,9 @@
*/
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+/* According to MS-SMB2 specification The minimum recommended value is 65536.*/
+#define CIFS_MIN_DEFAULT_IOSIZE (65536)
+
/*
* SMB2 Header Definition
*
@@ -1564,13 +1567,13 @@ struct reparse_nfs_data_buffer {
__u8 DataBuffer[];
} __packed;
-/* For IO_REPARSE_TAG_LX_SYMLINK */
+/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */
struct reparse_wsl_symlink_data_buffer {
__le32 ReparseTag;
__le16 ReparseDataLength;
__u16 Reserved;
- __le32 Flags;
- __u8 PathBuffer[]; /* Variable Length UTF-8 string without nul-term */
+ __le32 Version; /* Always 2 */
+ __u8 Target[]; /* Variable Length UTF-8 string without nul-term */
} __packed;
struct validate_negotiate_info_req {
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 00b31cf86462..83caa3849749 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -1016,9 +1016,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
ses_enc_key = enc ? sess->smb3encryptionkey :
sess->smb3decryptionkey;
- if (enc)
- ksmbd_user_session_get(sess);
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ if (!enc)
+ ksmbd_user_session_put(sess);
return 0;
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 91c2318639e7..14620e147dda 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -27,6 +27,7 @@ enum {
KSMBD_SESS_EXITING,
KSMBD_SESS_NEED_RECONNECT,
KSMBD_SESS_NEED_NEGOTIATE,
+ KSMBD_SESS_NEED_SETUP,
KSMBD_SESS_RELEASING
};
@@ -187,6 +188,11 @@ static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn)
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE;
}
+static inline bool ksmbd_conn_need_setup(struct ksmbd_conn *conn)
+{
+ return READ_ONCE(conn->status) == KSMBD_SESS_NEED_SETUP;
+}
+
static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn)
{
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT;
@@ -217,6 +223,11 @@ static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn)
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE);
}
+static inline void ksmbd_conn_set_need_setup(struct ksmbd_conn *conn)
+{
+ WRITE_ONCE(conn->status, KSMBD_SESS_NEED_SETUP);
+}
+
static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn)
{
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 53d308f331af..3f45f28f6f0f 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -181,7 +181,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
down_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
- if (atomic_read(&sess->refcnt) == 0 &&
+ if (atomic_read(&sess->refcnt) <= 1 &&
(sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
sess->last_active + SMB2_SESSION_TIMEOUT))) {
@@ -233,7 +233,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
down_write(&conn->session_lock);
xa_erase(&conn->sessions, sess->id);
up_write(&conn->session_lock);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
}
@@ -252,7 +253,8 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (xa_empty(&sess->ksmbd_chann_list)) {
xa_erase(&conn->sessions, sess->id);
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
up_write(&conn->session_lock);
@@ -328,8 +330,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess)
if (atomic_read(&sess->refcnt) <= 0)
WARN_ON(1);
- else
- atomic_dec(&sess->refcnt);
+ else if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
@@ -372,13 +374,13 @@ void destroy_previous_session(struct ksmbd_conn *conn,
ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT);
err = ksmbd_conn_wait_idle_sess_id(conn, id);
if (err) {
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
goto out;
}
ksmbd_destroy_file_table(&prev_sess->file_table);
prev_sess->state = SMB2_SESSION_EXPIRED;
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
ksmbd_launch_ksmbd_durable_scavenger();
out:
up_write(&conn->session_lock);
@@ -436,7 +438,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
rwlock_init(&sess->tree_conns_lock);
- atomic_set(&sess->refcnt, 1);
+ atomic_set(&sess->refcnt, 2);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 4ddf4300371b..d24d95d15d87 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1249,7 +1249,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
}
conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
err_out:
ksmbd_conn_unlock(conn);
@@ -1271,6 +1271,9 @@ static int alloc_preauth_hash(struct ksmbd_session *sess,
if (sess->Preauth_HashValue)
return 0;
+ if (!conn->preauth_info)
+ return -ENOMEM;
+
sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue,
PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP);
if (!sess->Preauth_HashValue)
@@ -1674,6 +1677,11 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_debug(SMB, "Received smb2 session setup request\n");
+ if (!ksmbd_conn_need_setup(conn) && !ksmbd_conn_good(conn)) {
+ work->send_no_response = 1;
+ return rc;
+ }
+
WORK_BUFFERS(work, req, rsp);
rsp->StructureSize = cpu_to_le16(9);
@@ -1909,7 +1917,7 @@ out_err:
if (try_delay) {
ksmbd_conn_set_need_reconnect(conn);
ssleep(5);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
}
}
smb2_set_err_rsp(work);
@@ -2235,14 +2243,15 @@ int smb2_session_logoff(struct ksmbd_work *work)
return -ENOENT;
}
- ksmbd_destroy_file_table(&sess->file_table);
down_write(&conn->session_lock);
sess->state = SMB2_SESSION_EXPIRED;
up_write(&conn->session_lock);
- ksmbd_free_user(sess->user);
- sess->user = NULL;
- ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
+ if (sess->user) {
+ ksmbd_free_user(sess->user);
+ sess->user = NULL;
+ }
+ ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP);
rsp->StructureSize = cpu_to_le16(4);
err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index a3d8a905b07e..d742ba754348 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -72,6 +72,8 @@
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 49b128698670..5aa7a66334d9 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap,
return -EIO;
}
+ if (psid->num_subauth == 0) {
+ pr_err("%s: zero subauthorities!\n", __func__);
+ return -EIO;
+ }
+
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
@@ -1026,7 +1031,9 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct dentry *parent = path->dentry->d_parent;
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size;
- int rc = 0, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size;
+ int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size;
+ unsigned int dacloffset;
+ size_t dacl_struct_end;
u16 num_aces, ace_cnt = 0;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
@@ -1035,8 +1042,11 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
parent, &parent_pntsd);
if (pntsd_size <= 0)
return -ENOENT;
+
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
- if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) {
+ if (!dacloffset ||
+ check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size) {
rc = -EINVAL;
goto free_parent_pntsd;
}
@@ -1240,7 +1250,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
struct smb_ntsd *pntsd = NULL;
struct smb_acl *pdacl;
struct posix_acl *posix_acls;
- int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset;
+ int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size;
+ unsigned int dacl_offset;
+ size_t dacl_struct_end;
struct smb_sid sid;
int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
struct smb_ace *ace;
@@ -1259,7 +1271,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
dacl_offset = le32_to_cpu(pntsd->dacloffset);
if (!dacl_offset ||
- (dacl_offset + sizeof(struct smb_acl) > pntsd_size))
+ check_add_overflow(dacl_offset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size)
goto err_out;
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 97c4d71115d8..d80f94346199 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index fffd6fffdce0..ae0ca6858496 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -3,7 +3,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
- select LIBCRC32C
+ select CRC32
select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e7f1b324b3b..1a2b3f06fa71 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -105,6 +105,7 @@ xfs_buf_free(
{
unsigned int size = BBTOB(bp->b_length);
+ might_sleep();
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b4ffd80b7cb6..dcbfa274e06d 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -165,7 +165,7 @@ xmbuf_map_backing_mem(
folio_set_dirty(folio);
folio_unlock(folio);
- bp->b_addr = folio_address(folio);
+ bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
return 0;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index edbc521870a1..b4e32f0860b7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done(
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
(lip->li_lsn == qlip->qli_flush_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
-
spin_lock(&ailp->ail_lock);
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (lip->li_lsn == qlip->qli_flush_lsn) {
/* xfs_ail_update_finish() drops the AIL lock */
tail_lsn = xfs_ail_delete_one(ailp, lip);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index a4bc1642fe56..414b27a86458 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
+ struct xfs_fsmap key0 = *keys; /* struct copy */
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
@@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
- if (keys[0].fmr_physical >= eofs)
+ if (key0.fmr_physical >= eofs)
return 0;
+ /*
+ * On zoned filesystems with an internal rt volume, the volume comes
+ * immediately after the end of the data volume. However, the
+ * xfs_rtblock_t address space is relative to the start of the data
+ * device, which means that the first @rtstart fsblocks do not actually
+ * point anywhere. If a fsmap query comes in with the low key starting
+ * below @rtstart, report it as "owned by filesystem".
+ */
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
- if (keys[0].fmr_physical < rtstart_daddr) {
+ if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
- /* Adjust the low key if we are continuing from where we left off. */
- if (keys[0].fmr_length > 0) {
- info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
- return 0;
+ /*
+ * Adjust the start of the query range if we're picking up from
+ * a previous round, and only emit the record if we haven't
+ * already gone past.
+ */
+ key0.fmr_physical += key0.fmr_length;
+ if (key0.fmr_physical < rtstart_daddr) {
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+
+ key0.fmr_physical = rtstart_daddr;
}
- /* Fabricate an rmap entry for space occupied by the data dev */
- error = xfs_getfsmap_helper(tp, info, &frec);
- if (error)
- return error;
+ /* Zero the other fields to avoid further adjustments. */
+ key0.fmr_owner = 0;
+ key0.fmr_offset = 0;
+ key0.fmr_length = 0;
}
- start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
- min(eofs - 1, keys[1].fmr_physical));
-
+ start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
/*
@@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
* low to the fsmap low key and max out the high key to the end
* of the rtgroup.
*/
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
if (error)
return error;
- info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
+ xfs_getfsmap_set_irec_flags(&info->low, &key0);
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 40fc1bf900af..c6cb0b6b9e46 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -1089,13 +1089,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
- *
- * We also clear the failed bit before removing the item from the AIL
- * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
- * references the inode item owns and needs to hold until we've fully
- * aborted the inode log item and detached it from the buffer.
*/
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6493bdb57351..980aabc49512 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog(
*
* 1. the current iclog is active and has no data; the previous iclog
* is in the active or dirty state.
- * 2. the current iclog is drity, and the previous iclog is in the
+ * 2. the current iclog is dirty, and the previous iclog is in the
* active or dirty state.
*
* We may sleep if:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..e5192c12e7ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -229,6 +229,7 @@ typedef struct xfs_mount {
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
+ unsigned int m_zonegc_low_space;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index b7e82d85f043..7a5c5ef2db92 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -718,8 +718,40 @@ max_open_zones_show(
}
XFS_SYSFS_ATTR_RO(max_open_zones);
+static ssize_t
+zonegc_low_space_store(
+ struct kobject *kobj,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ unsigned int val;
+
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 100)
+ return -EINVAL;
+
+ zoned_to_mp(kobj)->m_zonegc_low_space = val;
+
+ return count;
+}
+
+static ssize_t
+zonegc_low_space_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_zonegc_low_space);
+}
+XFS_SYSFS_ATTR_RW(zonegc_low_space);
+
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
+ ATTR_LIST(zonegc_low_space),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0fcb1828e598..85a649fec6ac 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -909,10 +909,9 @@ xfs_trans_ail_delete(
return;
}
- /* xfs_ail_update_finish() drops the AIL lock */
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
tail_lsn = xfs_ail_delete_one(ailp, lip);
- xfs_ail_update_finish(ailp, tail_lsn);
+ xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd841df93021..f945f0450b16 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
}
#endif
-static inline void
-xfs_clear_li_failed(
- struct xfs_log_item *lip)
-{
- struct xfs_buf *bp = lip->li_buf;
-
- ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
- lip->li_buf = NULL;
- xfs_buf_rele(bp);
- }
-}
-
-static inline void
-xfs_set_li_failed(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
- xfs_buf_hold(bp);
- lip->li_buf = bp;
- }
-}
-
#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 52af234936a2..d509e49b2aaa 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1201,6 +1201,13 @@ xfs_mount_zones(
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
+ /*
+ * The user may configure GC to free up a percentage of unused blocks.
+ * By default this is 0. GC will always trigger at the minimum level
+ * for keeping max_open_zones available for data placement.
+ */
+ mp->m_zonegc_low_space = 0;
+
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index c5136ea9bb1d..8c541ca71872 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -162,18 +162,30 @@ struct xfs_zone_gc_data {
/*
* We aim to keep enough zones free in stock to fully use the open zone limit
- * for data placement purposes.
+ * for data placement purposes. Additionally, the m_zonegc_low_space tunable
+ * can be set to make sure a fraction of the unused blocks are available for
+ * writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
+ s64 available, free;
+
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
return false;
- if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
+
+ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
+
+ if (available <
mp->m_groups[XG_TYPE_RTG].blocks *
(mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
return true;
+
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+ if (available < mult_frac(free, mp->m_zonegc_low_space, 100))
+ return true;
+
return false;
}