summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2024-11-05 09:55:37 +0100
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2024-11-05 09:56:08 +0100
commit85c4efbe608887cbce675fad3288172046f74713 (patch)
tree42aa0a76ec945920d04344a6c2a7f7c86d054c28 /fs
parentbf373d2919d98f3d1fe1b19a0304f72fe74386d9 (diff)
parent59b723cd2adbac2a34fc8e12c74ae26ae45bf230 (diff)
downloadlwn-85c4efbe608887cbce675fad3288172046f74713.tar.gz
lwn-85c4efbe608887cbce675fad3288172046f74713.zip
Merge v6.12-rc6 into usb-next
We need the USB fixes in here as well, and this resolves a merge conflict in: drivers/usb/typec/tcpm/tcpm.c Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Link: https://lore.kernel.org/r/20241101150730.090dc30f@canb.auug.org.au Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.h34
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_inode.c130
-rw-r--r--fs/9p/vfs_inode_dotl.c112
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/afs/dir.c25
-rw-r--r--fs/afs/dir_edit.c91
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rxrpc.c83
-rw-r--r--fs/autofs/dev-ioctl.c5
-rw-r--r--fs/backing-file.c8
-rw-r--r--fs/bcachefs/alloc_background.c37
-rw-r--r--fs/bcachefs/alloc_background.h3
-rw-r--r--fs/bcachefs/alloc_foreground.c21
-rw-r--r--fs/bcachefs/bcachefs.h1
-rw-r--r--fs/bcachefs/btree_gc.c12
-rw-r--r--fs/bcachefs/btree_io.c2
-rw-r--r--fs/bcachefs/btree_iter.c13
-rw-r--r--fs/bcachefs/btree_iter.h2
-rw-r--r--fs/bcachefs/btree_update.c4
-rw-r--r--fs/bcachefs/btree_update.h2
-rw-r--r--fs/bcachefs/btree_update_interior.c4
-rw-r--r--fs/bcachefs/buckets.c7
-rw-r--r--fs/bcachefs/buckets.h12
-rw-r--r--fs/bcachefs/chardev.c1
-rw-r--r--fs/bcachefs/darray.c15
-rw-r--r--fs/bcachefs/data_update.c21
-rw-r--r--fs/bcachefs/data_update.h3
-rw-r--r--fs/bcachefs/dirent.c7
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/disk_accounting.c6
-rw-r--r--fs/bcachefs/ec.c26
-rw-r--r--fs/bcachefs/errcode.h2
-rw-r--r--fs/bcachefs/error.c5
-rw-r--r--fs/bcachefs/extents.c86
-rw-r--r--fs/bcachefs/extents.h5
-rw-r--r--fs/bcachefs/fs-io-buffered.c6
-rw-r--r--fs/bcachefs/fs-io-pagecache.c70
-rw-r--r--fs/bcachefs/fs-io.c19
-rw-r--r--fs/bcachefs/fs.c18
-rw-r--r--fs/bcachefs/fsck.c281
-rw-r--r--fs/bcachefs/inode.c27
-rw-r--r--fs/bcachefs/inode.h1
-rw-r--r--fs/bcachefs/inode_format.h6
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/io_read.c8
-rw-r--r--fs/bcachefs/io_write.c4
-rw-r--r--fs/bcachefs/journal.c10
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/move.c2
-rw-r--r--fs/bcachefs/opts.c6
-rw-r--r--fs/bcachefs/opts.h3
-rw-r--r--fs/bcachefs/quota.c2
-rw-r--r--fs/bcachefs/rebalance.c4
-rw-r--r--fs/bcachefs/recovery.c16
-rw-r--r--fs/bcachefs/sb-downgrade.c3
-rw-r--r--fs/bcachefs/sb-errors_format.h4
-rw-r--r--fs/bcachefs/str_hash.h60
-rw-r--r--fs/bcachefs/subvolume.c7
-rw-r--r--fs/bcachefs/super-io.c5
-rw-r--r--fs/bcachefs/super.c2
-rw-r--r--fs/bcachefs/tests.c4
-rw-r--r--fs/bcachefs/xattr.c2
-rw-r--r--fs/btrfs/bio.c37
-rw-r--r--fs/btrfs/bio.h3
-rw-r--r--fs/btrfs/block-group.c2
-rw-r--r--fs/btrfs/defrag.c10
-rw-r--r--fs/btrfs/dir-item.c4
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent_io.c17
-rw-r--r--fs/btrfs/extent_map.c38
-rw-r--r--fs/btrfs/inode.c7
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/super.c12
-rw-r--r--fs/btrfs/volumes.c1
-rw-r--r--fs/dax.c45
-rw-r--r--fs/erofs/super.c4
-rw-r--r--fs/fuse/file.c18
-rw-r--r--fs/fuse/passthrough.c9
-rw-r--r--fs/iomap/buffered-io.c17
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/netfs/buffered_read.c47
-rw-r--r--fs/netfs/locking.c3
-rw-r--r--fs/netfs/read_collect.c2
-rw-r--r--fs/nfsd/nfs4proc.c8
-rw-r--r--fs/nfsd/nfs4state.c50
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nilfs2/namei.c3
-rw-r--r--fs/nilfs2/page.c7
-rw-r--r--fs/ocfs2/file.c17
-rw-r--r--fs/open.c2
-rw-r--r--fs/overlayfs/file.c9
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/smb/client/cifsfs.c2
-rw-r--r--fs/smb/client/fs_context.c7
-rw-r--r--fs/squashfs/file_direct.c9
-rw-r--r--fs/super.c26
-rw-r--r--fs/userfaultfd.c28
-rw-r--r--fs/xfs/libxfs/xfs_ag.c75
-rw-r--r--fs/xfs/libxfs/xfs_ag.h11
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/scrub/repair.c8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c70
-rw-r--r--fs/xfs/xfs_filestream.c99
-rw-r--r--fs/xfs/xfs_fsops.c20
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_ioctl.c4
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_log_recover.c7
-rw-r--r--fs/xfs/xfs_mount.c9
-rw-r--r--fs/xfs/xfs_trace.h15
114 files changed, 1477 insertions, 686 deletions
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 1775fcc7f0e8..698c43dd5dc8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,14 +179,16 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
-extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid,
- bool new);
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
+ struct p9_fid *fid,
+ struct super_block *sb, int new);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
- struct p9_fid *fid, bool new);
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
+ struct p9_fid *fid,
+ struct super_block *sb, int new);
/* other default globals */
#define V9FS_PORT 564
@@ -225,12 +227,30 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
*/
static inline struct inode *
v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, bool new)
+ struct super_block *sb)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_fid_iget_dotl(sb, fid, new);
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
else
- return v9fs_fid_iget(sb, fid, new);
+ return v9fs_inode_from_fid(v9ses, fid, sb, 0);
+}
+
+/**
+ * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
+ * issuing a attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb)
+{
+ if (v9fs_proto_dotl(v9ses))
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
+ else
+ return v9fs_inode_from_fid(v9ses, fid, sb, 1);
}
#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 7923c3c347cb..d3aefbec4de6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,7 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
+ struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
#if (BITS_PER_LONG == 32)
#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index effb3aa1f3ed..3e68521f4e2f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -256,12 +256,9 @@ void v9fs_set_netfs_context(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
+ struct inode *inode, umode_t mode, dev_t rdev)
{
int err = 0;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
@@ -295,7 +292,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode->i_op = &v9fs_file_inode_operations;
inode->i_fop = &v9fs_file_operations;
}
- mapping_set_large_folios(inode->i_mapping);
break;
case S_IFLNK:
@@ -366,59 +362,105 @@ void v9fs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-struct inode *
-v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new)
+static int v9fs_test_inode(struct inode *inode, void *data)
+{
+ int umode;
+ dev_t rdev;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+
+ umode = p9mode2unixmode(v9ses, st, &rdev);
+ /* don't match inode of different type */
+ if (inode_wrong_type(inode, umode))
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
+ return 1;
+}
+
+static int v9fs_test_new_inode(struct inode *inode, void *data)
+{
+ return 0;
+}
+
+static int v9fs_set_inode(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ return 0;
+}
+
+static struct inode *v9fs_qid_iget(struct super_block *sb,
+ struct p9_qid *qid,
+ struct p9_wstat *st,
+ int new)
{
dev_t rdev;
int retval;
umode_t umode;
struct inode *inode;
- struct p9_wstat *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *inode, void *data);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
- if (!new) {
- goto done;
- } else {
- p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n",
- inode->i_ino);
- iput(inode);
- remove_inode_hash(inode);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- WARN_ON(!(inode->i_state & I_NEW));
- }
- }
+ if (new)
+ test = v9fs_test_new_inode;
+ else
+ test = v9fs_test_inode;
+ inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto error;
- }
-
+ inode->i_ino = QID2INO(qid);
umode = p9mode2unixmode(v9ses, st, &rdev);
- retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
- v9fs_stat2inode(st, inode, sb, 0);
- p9stat_free(st);
- kfree(st);
+ retval = v9fs_init_inode(v9ses, inode, umode, rdev);
if (retval)
goto error;
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
-done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb, int new)
+{
+ struct p9_wstat *st;
+ struct inode *inode = NULL;
+
+ st = p9_client_stat(fid);
+ if (IS_ERR(st))
+ return ERR_CAST(st);
+
+ inode = v9fs_qid_iget(sb, &st->qid, st, new);
+ p9stat_free(st);
+ kfree(st);
+ return inode;
}
/**
@@ -450,15 +492,8 @@ static int v9fs_at_to_dotl_flags(int flags)
*/
static void v9fs_dec_count(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) {
- if (inode->i_nlink) {
- drop_nlink(inode);
- } else {
- p9_debug(P9_DEBUG_VFS,
- "WARNING: unexpected i_nlink zero %d inode %ld\n",
- inode->i_nlink, inode->i_ino);
- }
- }
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
}
/**
@@ -509,9 +544,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
} else
v9fs_dec_count(inode);
- if (inode->i_nlink <= 0) /* no more refs unhash it */
- remove_inode_hash(inode);
-
v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
@@ -577,7 +609,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
/*
* instantiate inode and assign the unopened fid to the dentry
*/
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
@@ -705,8 +737,10 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
inode = NULL;
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
+ else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c61b97bd13b9..143ac03b7425 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,50 +52,80 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
+static int v9fs_test_inode_dotl(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+ /* don't match inode of different type */
+ if (inode_wrong_type(inode, st->st_mode))
+ return 0;
-struct inode *
-v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
+ if (inode->i_generation != st->st_gen)
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
+ return 1;
+}
+
+/* Always get a new inode */
+static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
+{
+ return 0;
+}
+
+static int v9fs_set_inode_dotl(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ inode->i_generation = st->st_gen;
+ return 0;
+}
+
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+ struct p9_qid *qid,
+ struct p9_fid *fid,
+ struct p9_stat_dotl *st,
+ int new)
{
int retval;
struct inode *inode;
- struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *inode, void *data);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
- if (!new) {
- goto done;
- } else { /* deal with race condition in inode number reuse */
- p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n",
- inode->i_ino);
- iput(inode);
- remove_inode_hash(inode);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- WARN_ON(!(inode->i_state & I_NEW));
- }
- }
+ if (new)
+ test = v9fs_test_new_inode_dotl;
+ else
+ test = v9fs_test_inode_dotl;
+ inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto error;
- }
-
- retval = v9fs_init_inode(v9ses, inode, &fid->qid,
+ inode->i_ino = QID2INO(qid);
+ retval = v9fs_init_inode(v9ses, inode,
st->st_mode, new_decode_dev(st->st_rdev));
- v9fs_stat2inode_dotl(st, inode, 0);
- kfree(st);
if (retval)
goto error;
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
@@ -103,11 +133,27 @@ v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
goto error;
unlock_new_inode(inode);
-done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb, int new)
+{
+ struct p9_stat_dotl *st;
+ struct inode *inode = NULL;
+
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+ if (IS_ERR(st))
+ return ERR_CAST(st);
+
+ inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
+ kfree(st);
+ return inode;
}
struct dotl_openflag_map {
@@ -259,7 +305,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
goto out;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -309,6 +355,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
umode_t omode)
{
int err;
+ struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
kgid_t gid;
const unsigned char *name;
@@ -318,6 +365,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+ v9ses = v9fs_inode2v9ses(dir);
omode |= S_IFDIR;
if (dir->i_mode & S_ISGID)
@@ -352,7 +400,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -749,6 +797,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
kgid_t gid;
const unsigned char *name;
umode_t mode;
+ struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
struct p9_qid qid;
@@ -758,6 +807,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
+ v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
@@ -788,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
err);
goto error;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f52fdf42945c..489db161abc9 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -139,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index f8622ed72e08..ada363af5aab 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,6 +12,7 @@
#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/sched.h>
+#include <linux/iversion.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "afs_fs.h"
@@ -1823,6 +1824,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+
_enter("op=%08x", op->debug_id);
op->ctime = op->file[0].scb.status.mtime_client;
@@ -1832,6 +1835,22 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+
+ /* If we're moving a subdir between dirs, we need to update
+ * its DV counter too as the ".." will be altered.
+ */
+ if (S_ISDIR(vnode->netfs.inode.i_mode) &&
+ op->file[0].vnode != op->file[1].vnode) {
+ u64 new_dv;
+
+ write_seqlock(&vnode->cb_lock);
+
+ new_dv = vnode->status.data_version + 1;
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&vnode->cb_lock);
+ }
}
static void afs_rename_edit_dir(struct afs_operation *op)
@@ -1873,6 +1892,12 @@ static void afs_rename_edit_dir(struct afs_operation *op)
&vnode->fid, afs_edit_dir_for_rename_2);
}
+ if (S_ISDIR(vnode->netfs.inode.i_mode) &&
+ new_dvnode != orig_dvnode &&
+ test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ afs_edit_dir_update_dotdot(vnode, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
new_inode = d_inode(new_dentry);
if (new_inode) {
spin_lock(&new_inode->i_lock);
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index a71bff10496b..fe223fb78111 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -127,10 +127,10 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
/*
* Scan a directory block looking for a dirent of the right name.
*/
-static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
+static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name,
unsigned int blocknum)
{
- union afs_xdr_dirent *de;
+ const union afs_xdr_dirent *de;
u64 bitmap;
int d, len, n;
@@ -492,3 +492,90 @@ error:
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out_unmap;
}
+
+/*
+ * Edit a subdirectory that has been moved between directories to update the
+ * ".." entry.
+ */
+void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
+ enum afs_edit_dir_reason why)
+{
+ union afs_xdr_dir_block *block;
+ union afs_xdr_dirent *de;
+ struct folio *folio;
+ unsigned int nr_blocks, b;
+ pgoff_t index;
+ loff_t i_size;
+ int slot;
+
+ _enter("");
+
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (i_size < AFS_DIR_BLOCK_SIZE) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ return;
+ }
+ nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+ /* Find a block that has sufficient slots available. Each folio
+ * contains two or more directory blocks.
+ */
+ for (b = 0; b < nr_blocks; b++) {
+ index = b / AFS_DIR_BLOCKS_PER_PAGE;
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
+ goto error;
+
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
+
+ /* Abandon the edit if we got a callback break. */
+ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ goto invalidated;
+
+ slot = afs_dir_scan_block(block, &dotdot_name, b);
+ if (slot >= 0)
+ goto found_dirent;
+
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ /* Didn't find the dirent to clobber. Download the directory again. */
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+
+found_dirent:
+ de = &block->dirents[slot];
+ de->u.vnode = htonl(new_dvnode->fid.vnode);
+ de->u.unique = htonl(new_dvnode->fid.unique);
+
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
+ ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
+
+out:
+ _leave("");
+ return;
+
+invalidated:
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+
+error:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6e1d3c4daf72..c9d620175e80 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -130,6 +130,7 @@ struct afs_call {
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
+ struct work_struct free_work; /* Deferred free processor */
struct rxrpc_call *rxcall; /* RxRPC call handle */
struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* security for this call */
@@ -1072,6 +1073,8 @@ extern void afs_check_for_remote_deletion(struct afs_operation *);
extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
+ enum afs_edit_dir_reason why);
/*
* dir_silly.c
@@ -1331,6 +1334,7 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
+void afs_deferred_put_call(struct afs_call *call);
void afs_make_call(struct afs_call *call, gfp_t gfp);
void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index c453428f3c8b..9f2a3bb56ec6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,6 +18,7 @@
struct workqueue_struct *afs_async_calls;
+static void afs_deferred_free_worker(struct work_struct *work);
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_process_async_call(struct work_struct *);
@@ -149,6 +150,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
+ INIT_WORK(&call->free_work, afs_deferred_free_worker);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
call->iter = &call->def_iter;
@@ -159,6 +161,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
return call;
}
+static void afs_free_call(struct afs_call *call)
+{
+ struct afs_net *net = call->net;
+ int o;
+
+ ASSERT(!work_pending(&call->async_work));
+
+ rxrpc_kernel_put_peer(call->peer);
+
+ if (call->rxcall) {
+ rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
+ rxrpc_kernel_put_call(net->socket, call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->type->destructor)
+ call->type->destructor(call);
+
+ afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
+ kfree(call->request);
+
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
+ __builtin_return_address(0));
+ kfree(call);
+
+ o = atomic_dec_return(&net->nr_outstanding_calls);
+ if (o == 0)
+ wake_up_var(&net->nr_outstanding_calls);
+}
+
/*
* Dispose of a reference on a call.
*/
@@ -173,32 +205,34 @@ void afs_put_call(struct afs_call *call)
o = atomic_read(&net->nr_outstanding_calls);
trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
__builtin_return_address(0));
+ if (zero)
+ afs_free_call(call);
+}
- if (zero) {
- ASSERT(!work_pending(&call->async_work));
- ASSERT(call->type->name != NULL);
-
- rxrpc_kernel_put_peer(call->peer);
-
- if (call->rxcall) {
- rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
- rxrpc_kernel_put_call(net->socket, call->rxcall);
- call->rxcall = NULL;
- }
- if (call->type->destructor)
- call->type->destructor(call);
+static void afs_deferred_free_worker(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, free_work);
- afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
- kfree(call->request);
+ afs_free_call(call);
+}
- trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
- __builtin_return_address(0));
- kfree(call);
+/*
+ * Dispose of a reference on a call, deferring the cleanup to a workqueue
+ * to avoid lock recursion.
+ */
+void afs_deferred_put_call(struct afs_call *call)
+{
+ struct afs_net *net = call->net;
+ unsigned int debug_id = call->debug_id;
+ bool zero;
+ int r, o;
- o = atomic_dec_return(&net->nr_outstanding_calls);
- if (o == 0)
- wake_up_var(&net->nr_outstanding_calls);
- }
+ zero = __refcount_dec_and_test(&call->ref, &r);
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
+ __builtin_return_address(0));
+ if (zero)
+ schedule_work(&call->free_work);
}
static struct afs_call *afs_get_call(struct afs_call *call,
@@ -640,7 +674,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall,
}
/*
- * wake up an asynchronous call
+ * Wake up an asynchronous call. The caller is holding the call notify
+ * spinlock around this, so we can't call afs_put_call().
*/
static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long call_user_ID)
@@ -657,7 +692,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
__builtin_return_address(0));
if (!queue_work(afs_async_calls, &call->async_work))
- afs_put_call(call);
+ afs_deferred_put_call(call);
}
}
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index f011e026358e..6d57efbb8110 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
*/
static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
{
+ unsigned int inr = _IOC_NR(cmd);
int err;
err = check_dev_ioctl_version(cmd, param);
@@ -133,7 +134,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
* check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD.
*/
err = check_name(param->path);
- if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
+ if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
err = err ? 0 : -EINVAL;
if (err) {
pr_warn("invalid path supplied for cmd(0x%08x)\n",
@@ -141,8 +142,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
goto out;
}
} else {
- unsigned int inr = _IOC_NR(cmd);
-
if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD ||
inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD ||
inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) {
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 8860dac58c37..09a9be945d45 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -80,7 +80,7 @@ struct backing_aio {
refcount_t ref;
struct kiocb *orig_iocb;
/* used for aio completion */
- void (*end_write)(struct file *);
+ void (*end_write)(struct file *, loff_t, ssize_t);
struct work_struct work;
long res;
};
@@ -109,7 +109,7 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res)
struct kiocb *orig_iocb = aio->orig_iocb;
if (aio->end_write)
- aio->end_write(orig_iocb->ki_filp);
+ aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res);
orig_iocb->ki_pos = iocb->ki_pos;
backing_aio_put(aio);
@@ -239,7 +239,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
if (ctx->end_write)
- ctx->end_write(ctx->user_file);
+ ctx->end_write(ctx->user_file, iocb->ki_pos, ret);
} else {
struct backing_aio *aio;
@@ -317,7 +317,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
revert_creds(old_cred);
if (ctx->end_write)
- ctx->end_write(ctx->user_file);
+ ctx->end_write(ctx->user_file, ppos ? *ppos : 0, ret);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 6e161f8ffe8d..c84a91572a1d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -1977,7 +1977,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
ca->mi.bucket_size,
GFP_KERNEL);
- int ret = bch2_trans_do(c, NULL, NULL,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc,
bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
@@ -2137,14 +2137,15 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
if (ret)
- break;
+ goto restart_err;
if (!k.k)
break;
ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+restart_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
if (ret)
break;
@@ -2350,24 +2351,19 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
/* Bucket IO clocks: */
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- u64 now;
- int ret = 0;
-
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
- a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- ret = PTR_ERR_OR_ZERO(a);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
+ int ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
- now = bch2_current_io_time(c, rw);
+ u64 now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2380,6 +2376,15 @@ out:
return ret;
}
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
+}
+
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index f8e87c6721b1..163a67b97a40 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type)
static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
+ if (a.data_type >= BCH_DATA_NR)
+ return 0;
+
if (!data_type_movable(a.data_type) ||
!bch2_bucket_sectors_fragmented(ca, a))
return 0;
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index d0e0b56892e3..372178c8d416 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
+ rcu_read_unlock();
+
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
ob - c->open_buckets;
@@ -684,7 +688,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
struct bch_dev_usage usage;
struct open_bucket *ob;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
data_type, cl, false, &usage)));
return ob;
@@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
u64 avail;
bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark);
+ avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
if (!avail)
continue;
@@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, ob);
@@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
--c->open_buckets_partial_nr;
swap(c->open_buckets_partial[i],
c->open_buckets_partial[c->open_buckets_partial_nr]);
+
ob->on_partial_list = false;
+
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
spin_lock(&c->freelist_lock);
@@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- (!ca || ob->dev == ca->dev_idx))
+ if (ob->valid && (!ca || ob->dev == ca->dev_idx))
bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f4151ee51b03..e94a83b8113e 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -555,6 +555,7 @@ struct bch_dev {
u64 alloc_cursor[3];
unsigned nr_open_buckets;
+ unsigned nr_partial_buckets;
unsigned nr_btree_reserve;
size_t inc_gen_needs_gc;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 94bbd8505582..0ca3feeb42c8 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -820,12 +820,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
* fix that here:
*/
alloc_data_type_set(&gc, gc.data_type);
-
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors) {
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
if (ret)
return ret;
+
+ /*
+ * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
+ * safe w.r.t. transaction restarts, so fixup the gc_bucket so
+ * we don't run it twice:
+ */
+ percpu_down_read(&c->mark_lock);
+ struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
+ gc_m->data_type = gc.data_type;
+ gc_m->dirty_sectors = gc.dirty_sectors;
+ percpu_up_read(&c->mark_lock);
}
if (fsck_err_on(new.data_type != gc.data_type,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index cf933409d385..6296a11ccb09 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1871,7 +1871,7 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 0883cf6e1a3e..eef9b89c561d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
+ if (!k.k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " at btree ");
+ bch2_btree_pos_to_text(&buf, c, l->b);
+
+ ret = bch2_fs_topology_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
bch2_bkey_buf_reassemble(out, c, k);
@@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
+err:
bch2_btree_and_journal_iter_exit(&jiter);
return ret;
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 31a58bf46fdb..0bda054f80d7 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -912,6 +912,8 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
_ret; \
})
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 514df618548e..5d809e8bd170 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -668,7 +668,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
struct disk_reservation *disk_res, int flags,
enum btree_iter_update_trigger_flags iter_flags)
{
- return bch2_trans_do(c, disk_res, NULL, flags,
+ return bch2_trans_commit_do(c, disk_res, NULL, flags,
bch2_btree_insert_trans(trans, id, k, iter_flags));
}
@@ -865,7 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
memcpy(l->d, buf.buf, buf.pos);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
- ret = bch2_trans_do(c, NULL, NULL,
+ ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_lazy_rw|commit_flags,
__bch2_trans_log_msg(trans, &buf, u64s));
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 6a454f2fa005..70b3c989fac2 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -192,7 +192,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 190bc1e81756..64f0928e1137 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2239,10 +2239,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
- async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 546cd01a72e3..ec7d9a59bea9 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1160,11 +1160,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
#define SECTORS_CACHE 1024
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, get;
- s64 sectors_available;
+ u64 sectors_available;
int ret;
percpu_down_read(&c->mark_lock);
@@ -1202,6 +1202,9 @@ recalculate:
percpu_u64_set(&c->pcpu->sectors_available, 0);
sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
+ sectors = min(sectors, sectors_available);
+
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index e2cb7b24b220..fd5e6ccad45e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -344,14 +344,16 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
}
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+enum bch_reservation_flags {
+ BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
+ BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
+};
-int __bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
+ u64, enum bch_reservation_flags);
static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
#ifdef __KERNEL__
u64 old, new;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index cbfd88f98472..2182b555c112 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -225,6 +225,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index 4f06cd8bbbe1..e86d36d23e9e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -2,6 +2,7 @@
#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "darray.h"
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
@@ -9,7 +10,19 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
+ /*
+ * This is a workaround: kvmalloc() doesn't support > INT_MAX
+ * allocations, but vmalloc() does.
+ * The limit needs to be lifted from kvmalloc, and when it does
+ * we'll go back to just using that.
+ */
+ size_t bytes;
+ if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
+ return -ENOMEM;
+
+ void *data = likely(bytes < INT_MAX)
+ ? kvmalloc_noprof(bytes, gfp)
+ : vmalloc_noprof(bytes);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index a6ee0beee6b0..8e75a852b358 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -236,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), ptr);
rewrites_found |= 1U << i;
}
i++;
@@ -284,7 +285,8 @@ restart_drop_extra_replicas:
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
goto restart_drop_extra_replicas;
}
}
@@ -295,7 +297,7 @@ restart_drop_extra_replicas:
bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, bkey_i_to_s(insert));
+ bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
ret = bch2_sum_sector_overwrites(trans, &iter, insert,
&should_check_enospc,
@@ -558,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct data_update_opts data_opts)
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@@ -569,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
if (ret)
return ret;
- while (data_opts.kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ while (data_opts->kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts.kill_ptrs ^= 1U << drop;
+ data_opts->kill_ptrs ^= 1U << drop;
}
/*
@@ -581,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
- bch2_extent_normalize(c, bkey_i_to_s(n));
+ bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
@@ -720,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
goto out;
}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 8d36365bdea8..e4b50723428e 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *,
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
- struct data_update_opts);
+ struct bch_io_opts *,
+ struct data_update_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 84dd4a879d98..faffc98d5605 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -250,13 +250,6 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
}
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 8945145865c5..53ad99666022 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -34,6 +34,13 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+static inline void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index e309fb78529b..07eb8fa1b026 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -856,8 +856,10 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
};
u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
- int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc));
+ int ret = bch2_trans_do(c, ({
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
+ }));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index e410cfe37b1a..749dcf368841 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -266,12 +266,12 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
if (!deleting) {
a->stripe = s.k->p.offset;
a->stripe_redundancy = s.v->nr_redundant;
+ alloc_data_type_set(a, data_type);
} else {
a->stripe = 0;
a->stripe_redundancy = 0;
+ alloc_data_type_set(a, BCH_DATA_user);
}
-
- alloc_data_type_set(a, data_type);
err:
printbuf_exit(&buf);
return ret;
@@ -1186,7 +1186,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
bch_err_fn(c, ret);
if (ret)
@@ -1519,14 +1519,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- s->have_existing_stripe
- ? bkey_i_to_stripe(&s->existing_stripe.key)
- : NULL,
- bkey_i_to_stripe(&s->new_stripe.key)));
+ ret = bch2_trans_commit_do(c, &s->res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
+ s->have_existing_stripe
+ ? bkey_i_to_stripe(&s->existing_stripe.key)
+ : NULL,
+ bkey_i_to_stripe(&s->new_stripe.key)));
bch_err_msg(c, ret, "creating stripe key");
if (ret) {
goto err;
@@ -1870,6 +1870,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
}
h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
+ if (!h) {
+ h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
+ goto err;
+ }
found:
if (h->rw_devs_change_count != c->rw_devs_change_count)
ec_stripe_head_devs_update(c, h);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 649263516ab1..a1bc6c7a8ba0 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -83,6 +83,7 @@
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
x(ENOMEM, ENOMEM_disk_accounting) \
+ x(ENOMEM, ENOMEM_stripe_head_alloc) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -222,6 +223,7 @@
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
x(BCH_ERR_invalid_sb, invalid_sb_members) \
x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 7a79f695ba2e..b679def8fb98 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -251,7 +251,10 @@ int __bch2_fsck_err(struct bch_fs *c,
* delete the key)
* - and we don't need to warn if we're not prompting
*/
- WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
+ WARN_ON((flags & FSCK_CAN_FIX) &&
+ !(flags & FSCK_AUTOFIX) &&
+ !trans &&
+ bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index cc0d22085aef..c4e91d123849 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
return NULL;
}
-void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bch_extent_ptr *ptr)
+{
+ if (!opts->promote_target ||
+ !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+ return false;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+
+ return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+}
+
+void bch2_extent_ptr_set_cached(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
- union bch_extent_entry *ec = NULL;
+ struct extent_ptr_decoded p;
- bkey_extent_entry_for_each(ptrs, entry) {
+ rcu_read_lock();
+ if (!want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ goto out;
+ }
+
+ /*
+ * Stripes can't contain cached data, for - reasons.
+ *
+ * Possibly something we can fix in the future?
+ */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (&entry->ptr == ptr) {
- ptr->cached = true;
- if (ec)
- extent_entry_drop(k, ec);
- return;
+ if (p.has_ec)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ else
+ ptr->cached = true;
+ goto out;
}
- if (extent_entry_is_stripe_ptr(entry))
- ec = entry;
- else if (extent_entry_is_ptr(entry))
- ec = NULL;
- }
-
BUG();
+out:
+ rcu_read_unlock();
}
/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
*
* Returns true if @k should be dropped entirely
*
@@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
rcu_read_lock();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
- dev_ptr_stale_rcu(ca, ptr) > 0);
+ (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+ dev_ptr_stale_rcu(ca, ptr) > 0));
+ rcu_read_unlock();
+
+ return bkey_deleted(k.k);
+}
+
+/*
+ * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
+ *
+ * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
+ * the promote target.
+ */
+bool bch2_extent_normalize_by_opts(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k)
+{
+ struct bkey_ptrs ptrs;
+ bool have_cached_ptr;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr(k, ptr);
+ goto restart_drop_ptrs;
+ }
+ have_cached_ptr = true;
+ }
rcu_read_unlock();
return bkey_deleted(k.k);
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 923a5f1849a8..bcffcf60aaaf 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -686,9 +686,12 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+ struct bkey_s, struct bch_extent_ptr *);
+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 48a1ab9a649b..95972809e76d 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -856,6 +856,12 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
folios_trunc(&fs, fi);
end = min(end, folio_end_pos(darray_last(fs)));
} else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
folios_trunc(&fs, fi + 1);
end = f_pos + f_reserved;
}
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index af3a24546aa3..1d4910ea0f1d 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -399,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c,
bch2_quota_reservation_put(c, inode, &res->quota);
}
-int bch2_folio_reservation_get(struct bch_fs *c,
+static int __bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- size_t offset, size_t len)
+ size_t offset, size_t len,
+ bool partial)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
+ struct disk_reservation disk_res = {};
+ size_t reserved = len;
int ret;
if (!s)
@@ -422,48 +425,65 @@ int bch2_folio_reservation_get(struct bch_fs *c,
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
+ partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
if (unlikely(ret))
return ret;
+
+ if (unlikely(disk_res.sectors != disk_sectors)) {
+ disk_sectors = quota_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ if (disk_sectors > disk_res.sectors) {
+ /*
+ * Make sure to get a reservation that's
+ * aligned to the filesystem blocksize:
+ */
+ unsigned reserved_offset = round_down(i << 9, block_bytes(c));
+ reserved = clamp(reserved_offset, offset, offset + len) - offset;
+
+ if (!reserved) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return -BCH_ERR_ENOSPC_disk_reservation;
+ }
+ break;
+ }
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+ }
}
if (quota_sectors) {
ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = { .sectors = disk_sectors };
-
- bch2_disk_reservation_put(c, &tmp);
- res->disk.sectors -= disk_sectors;
+ bch2_disk_reservation_put(c, &disk_res);
return ret;
}
}
- return 0;
+ res->disk.sectors += disk_res.sectors;
+ return partial ? reserved : 0;
}
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+int bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
size_t offset, size_t len)
{
- size_t l, reserved = 0;
- int ret;
-
- while ((l = len - reserved)) {
- while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) {
- if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c))
- return reserved ?: ret;
-
- len = reserved + l;
- l /= 2;
- }
-
- offset += l;
- reserved += l;
- }
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
+}
- return reserved;
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
}
static void bch2_clear_folio_bits(struct folio *folio)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 71d0fa387509..2456c41b215e 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -182,7 +182,7 @@ static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_unpacked u;
int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
return ret;
@@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_slots|BTREE_ITER_intent);
- while (!ret && bkey_lt(iter.pos, end_pos)) {
+ while (!ret) {
s64 i_sectors_delta = 0;
struct quota_res quota_res = { 0 };
struct bkey_s_c k;
@@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
+ if (bkey_ge(iter.pos, end_pos))
+ break;
+
ret = bch2_subvolume_get_snapshot(trans,
inode->ei_inum.subvol, &snapshot);
if (ret)
@@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
- opts.data_replicas, true))
+ opts.data_replicas, true)) {
ret = drop_locks_do(trans,
(bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
opts.data_replicas, false), 0));
+ if (ret)
+ goto bkey_err;
+ }
bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
if (ret)
@@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true))
- drop_locks_do(trans,
+ iter.pos.offset, true)) {
+ ret = drop_locks_do(trans,
bch2_mark_pagecache_reserved(inode, &hole_start,
iter.pos.offset, false));
+ if (ret)
+ goto bkey_err;
+ }
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index d7bd28bdaffc..a41d0d8a2f7b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -656,7 +656,7 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct bch_inode_info *inode;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
&hash, &dentry->d_name)));
if (IS_ERR(inode))
@@ -869,7 +869,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
- goto err;
+ goto err_tx_restart;
if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
ret = bch2_fs_quota_transfer(c, src_inode,
@@ -1266,7 +1266,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start), 0);
- while (true) {
+ while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1274,14 +1274,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
if (ret)
- goto err;
+ continue;
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek_upto(&iter, end);
ret = bkey_err(k);
if (ret)
- goto err;
+ continue;
if (!k.k)
break;
@@ -1301,7 +1301,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
- break;
+ continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
@@ -1329,10 +1329,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -2040,7 +2036,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
printbuf_nul_terminate(&buf);
- seq_puts(seq, buf.buf);
+ seq_printf(seq, ",%s", buf.buf);
int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index a1087fd292e4..75c8a97a6954 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -929,35 +929,138 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int hash_redo_key(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k)
-{
- struct bkey_i *delete;
- struct bkey_i *tmp;
-
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- if (IS_ERR(delete))
- return PTR_ERR(delete);
-
- tmp = bch2_bkey_make_mut_noupdate(trans, k);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
-
- bkey_init(&delete->k);
- delete->k.p = k_iter->pos;
- return bch2_btree_iter_traverse(k_iter) ?:
- bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_in_snapshot(trans, desc, hash_info,
- (subvol_inum) { 0, k.k->p.inode },
- k.k->p.snapshot, tmp,
- STR_HASH_must_create|
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
+{
+ if (d.v->d_type == DT_SUBVOL) {
+ u32 snap;
+ u64 inum;
+ int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+ }
+}
+
+/*
+ * Prefer to delete the first one, since that will be the one at the wrong
+ * offset:
+ * return value: 0 -> delete k1, 1 -> delete k2
+ */
+static int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
+{
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
+
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
+
+ ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
+{
+ if (new->k.type != KEY_TYPE_dirent)
+ return 0;
+
+ struct bkey_i_dirent *d = bkey_i_to_dirent(new);
+ struct inode_walker target = inode_walker_init();
+ int ret = 0;
+
+ if (d->v.d_type == DT_SUBVOL) {
+ BUG();
+ } else {
+ ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
+ if (ret)
+ goto err;
+
+ darray_for_each(target.inodes, i) {
+ i->inode.bi_dir_offset = d->k.p.offset;
+ ret = __bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+ inode_walker_exit(&target);
+ return ret;
+}
+
+static int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
+
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+
+ if (u64s > U8_MAX)
+ return -EINVAL;
+
+ new->k.u64s = u64s;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
}
static int hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
@@ -986,16 +1089,9 @@ static int hash_check_key(struct btree_trans *trans,
if (bkey_eq(k.k->p, hash_k.k->p))
break;
- if (fsck_err_on(k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k),
- trans, hash_table_key_duplicate,
- "duplicate hash table keys:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- buf.buf))) {
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
- break;
- }
+ if (k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k))
+ goto duplicate_entries;
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
@@ -1008,18 +1104,66 @@ out:
return ret;
bad_hash:
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- bch_err_fn(c, ret);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
if (ret)
- return ret;
- ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
fsck_err:
goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
@@ -1096,10 +1240,36 @@ fsck_err:
return ret;
}
+static int get_snapshot_root_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *root,
+ u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found_root;
+ }
+ if (ret)
+ goto err;
+ BUG();
+found_root:
+ BUG_ON(bch2_inode_unpack(k, root));
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_inode_unpacked *prev,
+ struct bch_inode_unpacked *snapshot_root,
struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
@@ -1123,16 +1293,19 @@ static int check_inode(struct btree_trans *trans,
BUG_ON(bch2_inode_unpack(k, &u));
- if (prev->bi_inum != u.bi_inum)
- *prev = u;
+ if (snapshot_root->bi_inum != u.bi_inum) {
+ ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
+ if (ret)
+ goto err;
+ }
- if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u),
+ if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
trans, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
- bch_err(c, "repair not implemented yet");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err_noprint;
+ u.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
+ do_update = true;
}
if (u.bi_dir || u.bi_dir_offset) {
@@ -1285,7 +1458,7 @@ err_noprint:
int bch2_check_inodes(struct bch_fs *c)
{
- struct bch_inode_unpacked prev = { 0 };
+ struct bch_inode_unpacked snapshot_root = {};
struct snapshots_seen s;
snapshots_seen_init(&s);
@@ -1295,7 +1468,7 @@ int bch2_check_inodes(struct bch_fs *c)
POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &prev, &s)));
+ check_inode(trans, &iter, k, &snapshot_root, &s)));
snapshots_seen_exit(&s);
bch_err_fn(c, ret);
@@ -2307,7 +2480,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -2421,7 +2594,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -2509,7 +2682,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 344ccb7a824c..039cb7a22244 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -163,8 +163,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
unsigned fieldnr = 0, field_bits;
int ret;
-#define x(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+#define x(_name, _bits) \
+ if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
memset((void *) unpacked + offset, 0, \
sizeof(*unpacked) - offset); \
@@ -283,6 +283,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -293,10 +295,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
- if (INODE_NEW_VARINT(inode.v)) {
+ if (INODEv1_NEW_VARINT(inode.v)) {
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
bkey_val_end(inode),
- INODE_NR_FIELDS(inode.v));
+ INODEv1_NR_FIELDS(inode.v));
} else {
return bch2_inode_unpack_v1(inode, unpacked);
}
@@ -471,10 +473,10 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
- bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
ret = __bch2_inode_validate(c, k, flags);
fsck_err:
@@ -533,6 +535,10 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "(%x)\n", inode->bi_flags);
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
+ prt_printf(out, "hash_type=");
+ bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
+ prt_newline(out);
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
prt_printf(out, "bi_version=%llu\n", inode->bi_version);
@@ -800,10 +806,8 @@ void bch2_inode_init_early(struct bch_fs *c,
memset(inode_u, 0, sizeof(*inode_u));
- /* ick */
- inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
- get_random_bytes(&inode_u->bi_hash_seed,
- sizeof(inode_u->bi_hash_seed));
+ SET_INODE_STR_HASH(inode_u, str_hash);
+ get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
@@ -1087,8 +1091,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(trans, inum, inode));
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index c8e98443e2d4..eab82b5eb897 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -92,6 +92,7 @@ struct bch_inode_unpacked {
BCH_INODE_FIELDS_v3()
#undef x
};
+BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
struct bkey_inode_buf {
struct bkey_i_inode_v3 inode;
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index a204e46b6b47..7928d0c6954f 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -150,9 +150,9 @@ enum __bch_inode_flags {
#undef x
};
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 307ed0a45184..f283051758d6 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -377,7 +377,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
* check for missing subvolume before fpunch, as in resume we don't want
* it to be a fatal error
*/
- ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors);
+ ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
if (ret)
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index e4fc17c548fd..fc246f342820 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -409,8 +409,8 @@ retry:
bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
goto err;
bch2_bkey_buf_reassemble(&sk, c, k);
@@ -557,8 +557,8 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
}
/* Inner part that may run in process context */
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index b5fe9e0dc155..8609e25e450f 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1437,7 +1437,7 @@ again:
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
@@ -1447,7 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- &op->cl, &wp));
+ &op->cl, &wp)));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index dc099f06341f..2dc0d60c1745 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -758,7 +758,7 @@ out:
return ret;
}
-int bch2_journal_flush_seq(struct journal *j, u64 seq)
+int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
{
u64 start_time = local_clock();
int ret, ret2;
@@ -769,7 +769,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
if (seq <= j->flushed_seq_ondisk)
return 0;
- ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+ ret = wait_event_state(j->wait,
+ (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
+ task_state);
if (!ret)
bch2_time_stats_update(j->flush_seq_time, start_time);
@@ -788,7 +790,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
int bch2_journal_flush(struct journal *j)
{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
}
/*
@@ -851,7 +853,7 @@ int bch2_journal_meta(struct journal *j)
bch2_journal_res_put(j, &res);
- return bch2_journal_flush_seq(j, res.seq);
+ return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
}
/* block/unlock the journal: */
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 377a3750406e..2762be6f9814 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -401,7 +401,7 @@ void bch2_journal_entry_res_resize(struct journal *,
int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 8c456d8b8b99..0ef4a86850bb 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas) {
if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 84097235eea9..6673cbd8bdb9 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -63,7 +63,7 @@ const char * const bch2_compression_opts[] = {
NULL
};
-const char * const bch2_str_hash_types[] = {
+const char * const __bch2_str_hash_types[] = {
BCH_STR_HASH_TYPES()
NULL
};
@@ -115,6 +115,7 @@ PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
@@ -596,6 +597,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ if (!*opt)
+ continue;
+
name = strsep(&opt, "=");
val = opt;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index cb2e244a2429..23dda014e331 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
-extern const char * const bch2_str_hash_types[];
+extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
@@ -29,6 +29,7 @@ void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index c32a05e252e2..74f45a8162ad 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -869,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 2d299a37cf07..cd6647374353 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -70,7 +70,9 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 55e1504a8130..32d15aacc069 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
@@ -1002,6 +1001,7 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
+ struct bch_member *m;
int ret;
bch_notice(c, "initializing new filesystem");
@@ -1018,6 +1018,14 @@ int bch2_fs_initialize(struct bch_fs *c)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
bch2_write_super(c);
}
+
+ for_each_member_device(c, ca) {
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
+ ca->mi = bch2_mi_to_cpu(m);
+ }
+
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
@@ -1091,7 +1099,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_init_early(c, &lostfound_inode);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_create_trans(trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index ae715ff658e8..8767c33c2b51 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -143,6 +143,9 @@ UPGRADE_TABLE()
static int have_stripes(struct bch_fs *c)
{
+ if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
+ return 0;
+
return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
}
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index aab328ac6dfa..937275d061fe 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -267,8 +267,8 @@ enum bch_fsck_flags {
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, 0) \
- x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
+ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
x(inode_bi_parent_nonzero, 251, 0) \
x(dirent_to_missing_parent_subvol, 252, 0) \
x(dirent_not_visible_in_parent_subvol, 253, 0) \
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 215eed4cce6d..ec2b1feea520 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -46,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
- .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS),
+ .type = INODE_STR_HASH(bi),
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -253,19 +252,20 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
- struct btree_iter iter, slot = { NULL };
+ struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
@@ -280,7 +280,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, &iter);
+ bch2_trans_copy_iter(&slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
@@ -290,29 +290,50 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
found:
found = true;
not_found:
-
- if (!found && (flags & STR_HASH_must_replace)) {
+ if (found && (flags & STR_HASH_must_create)) {
+ bch2_trans_iter_exit(trans, &slot);
+ return k;
+ } else if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & STR_HASH_must_create)) {
- ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
- swap(iter, slot);
+ swap(*iter, slot);
- insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, flags);
+ insert->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, insert, flags);
}
goto out;
}
static __always_inline
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
+ snapshot, insert, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (k.k) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_EEXIST_str_hash_set;
+ }
+
+ return 0;
+}
+
+static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@@ -363,8 +384,11 @@ int bch2_hash_delete(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_intent);
- int ret = bkey_err(k) ?:
- bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 91d8187ee168..80e5efaff524 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -319,8 +319,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_subvol_is_ro_trans(trans, subvol));
+ return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
}
int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
@@ -676,8 +675,8 @@ err:
/* set bi_subvol on root inode */
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index ce7410d72089..7c71594f6a8b 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
return -BCH_ERR_invalid_sb_layout_nr_superblocks;
}
+ if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
+ prt_printf(out, "Invalid superblock layout: max_size_bits too high");
+ return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
+ }
+
max_sectors = 1 << layout->sb_max_size_bits;
prev_offset = le64_to_cpu(layout->sb_offset[0]);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 77d811a539af..657fd3759e7b 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1972,7 +1972,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
};
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
- ret = bch2_trans_do(ca->fs, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b2f209743afe..315038a0a92d 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -450,7 +450,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
k.k_i.k.p.snapshot = snapid;
k.k_i.k.size = len;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
@@ -510,7 +510,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
if (ret)
return ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_snapshot_node_create(trans, U32_MAX,
snapids,
snapid_subvols,
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 56c8d3fe55a4..952aca400faf 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -330,7 +330,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c, NULL, NULL, 0,
+ int ret = bch2_trans_do(c,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
if (ret < 0 && bch2_err_matches(ret, ENOENT))
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index fec5c6cde0a7..7e0f9600b80c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
+ WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/*
@@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
}
}
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
- struct btrfs_bio *orig_bbio)
-{
- /*
- * For writes we tolerate nr_mirrors - 1 write failures, so we can't
- * just blindly propagate a write failure here. Instead increment the
- * error count in the original I/O context so that it is guaranteed to
- * be larger than the error tolerance.
- */
- if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
- struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
- struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
- atomic_add(orig_bioc->max_errors, &orig_bioc->error);
- } else {
- orig_bbio->bio.bi_status = bbio->bio.bi_status;
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- if (bbio->bio.bi_status)
- btrfs_bbio_propagate_error(bbio, orig_bbio);
btrfs_cleanup_bio(bbio);
bbio = orig_bbio;
}
- if (atomic_dec_and_test(&bbio->pending_ios))
+ /*
+ * At this point, bbio always points to the original btrfs_bio. Save
+ * the first error in it.
+ */
+ if (status != BLK_STS_OK)
+ cmpxchg(&bbio->status, BLK_STS_OK, status);
+
+ if (atomic_dec_and_test(&bbio->pending_ios)) {
+ /* Load split bio's error which might be set above. */
+ if (status == BLK_STS_OK)
+ bbio->bio.bi_status = READ_ONCE(bbio->status);
__btrfs_bio_end_io(bbio);
+ }
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e48612340745..e2fe16074ad6 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -79,6 +79,9 @@ struct btrfs_bio {
/* File system that this I/O operates on. */
struct btrfs_fs_info *fs_info;
+ /* Save the first error status of split bio. */
+ blk_status_t status;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 7980b2e33a92..4423d8b716a5 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
spin_lock(&cache->lock);
if (cache->ro)
space_info->bytes_readonly += num_bytes;
+ else if (btrfs_is_zoned(cache->fs_info))
+ space_info->bytes_zone_unusable += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index b95ef44c326b..968dae953948 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* We can get a merged extent, in that case, we need to re-search
* tree to get the original em for defrag.
*
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
+ * This is because even if we have adjacent extents that are contiguous
+ * and compatible (same type and flags), we still want to defrag them
+ * so that we use less metadata (extent items in the extent tree and
+ * file extent items in the inode's subvolume tree).
*/
- if (em && (em->flags & EXTENT_FLAG_MERGED) &&
- newer_than && em->generation >= newer_than) {
+ if (em && (em->flags & EXTENT_FLAG_MERGED)) {
free_extent_map(em);
em = NULL;
}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 001c0c2f872c..1e8cd7c9472e 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -347,8 +347,8 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
return di;
}
/* Adjust return code if the key was not found in the next leaf. */
- if (ret > 0)
- ret = 0;
+ if (ret >= 0)
+ ret = -ENOENT;
return ERR_PTR(ret);
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ad5db619b00..b11bfe68dd65 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1959,7 +1959,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 309a8ae48434..872cca54cc6c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -262,22 +262,23 @@ static noinline int lock_delalloc_folios(struct inode *inode,
for (i = 0; i < found_folios; i++) {
struct folio *folio = fbatch.folios[i];
- u32 len = end + 1 - start;
+ u64 range_start;
+ u32 range_len;
if (folio == locked_folio)
continue;
- if (btrfs_folio_start_writer_lock(fs_info, folio, start,
- len))
- goto out;
-
+ folio_lock(folio);
if (!folio_test_dirty(folio) || folio->mapping != mapping) {
- btrfs_folio_end_writer_lock(fs_info, folio, start,
- len);
+ folio_unlock(folio);
goto out;
}
+ range_start = max_t(u64, folio_pos(folio), start);
+ range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ end + 1) - range_start;
+ btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len);
- processed_end = folio_pos(folio) + folio_size(folio) - 1;
+ processed_end = range_start + range_len - 1;
}
folio_batch_release(&fbatch);
cond_resched();
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 25d191f1ac10..1d93e1202c33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -230,7 +230,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (extent_map_end(prev) != next->start)
return false;
- if (prev->flags != next->flags)
+ /*
+ * The merged flag is not an on-disk flag, it just indicates we had the
+ * extent maps of 2 (or more) adjacent extents merged, so factor it out.
+ */
+ if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
+ (next->flags & ~EXTENT_FLAG_MERGED))
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
@@ -243,13 +248,19 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
/*
* Handle the on-disk data extents merge for @prev and @next.
*
+ * @prev: left extent to merge
+ * @next: right extent to merge
+ * @merged: the extent we will not discard after the merge; updated with new values
+ *
+ * After this, one of the two extents is the new merged extent and the other is
+ * removed from the tree and likely freed. Note that @merged is one of @prev/@next
+ * so there is const/non-const aliasing occurring here.
+ *
* Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
* For now only uncompressed regular extent can be merged.
- *
- * @prev and @next will be both updated to point to the new merged range.
- * Thus one of them should be removed by the caller.
*/
-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
+static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
+ struct extent_map *merged)
{
u64 new_disk_bytenr;
u64 new_disk_num_bytes;
@@ -284,15 +295,10 @@ static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *nex
new_disk_bytenr;
new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
- prev->disk_bytenr = new_disk_bytenr;
- prev->disk_num_bytes = new_disk_num_bytes;
- prev->ram_bytes = new_disk_num_bytes;
- prev->offset = new_offset;
-
- next->disk_bytenr = new_disk_bytenr;
- next->disk_num_bytes = new_disk_num_bytes;
- next->ram_bytes = new_disk_num_bytes;
- next->offset = new_offset;
+ merged->disk_bytenr = new_disk_bytenr;
+ merged->disk_num_bytes = new_disk_num_bytes;
+ merged->ram_bytes = new_disk_num_bytes;
+ merged->offset = new_offset;
}
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
@@ -361,7 +367,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->generation = max(em->generation, merge->generation);
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- merge_ondisk_extents(merge, em);
+ merge_ondisk_extents(merge, em, em);
em->flags |= EXTENT_FLAG_MERGED;
validate_extent_map(fs_info, em);
@@ -378,7 +384,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- merge_ondisk_extents(em, merge);
+ merge_ondisk_extents(em, merge, em);
validate_extent_map(fs_info, em);
rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5618ca02934a..da51edbad6a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4368,11 +4368,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
- if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
btrfs_abort_transaction(trans, ret);
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1332ec59c539..a0e8deca87a7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1407,7 +1407,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 98adf4ec7b01..c229256d6fd5 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -121,6 +121,8 @@ struct btrfs_inode;
#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
+#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
*/
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 98fa0f382480..926d7a9ed99d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -340,6 +340,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears
+ * "force-compress" without the need to pass
+ * "compress-force=[no|none]" before specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
if (opt == Opt_compress || opt == Opt_compress_force) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
@@ -1498,8 +1507,7 @@ static int btrfs_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
- if (!mount_reconfigure &&
- !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
return -EINVAL;
ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d938..eb51b609190f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
diff --git a/fs/dax.c b/fs/dax.c
index c62acd2812f8..21b47402b3dc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1262,35 +1262,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
+ loff_t copy_pos = iter->pos;
+ u64 copy_len = iomap_length(iter);
+ u32 mod;
int id = 0;
s64 ret = 0;
void *daddr = NULL, *saddr = NULL;
- /* don't bother with blocks that are not shared to start with */
- if (!(iomap->flags & IOMAP_F_SHARED))
- return length;
+ if (!iomap_want_unshare_iter(iter))
+ return iomap_length(iter);
+
+ /*
+ * Extend the file range to be aligned to fsblock/pagesize, because
+ * we need to copy entire blocks, not just the byte range specified.
+ * Invalidate the mapping because we're about to CoW.
+ */
+ mod = offset_in_page(copy_pos);
+ if (mod) {
+ copy_len += mod;
+ copy_pos -= mod;
+ }
+
+ mod = offset_in_page(copy_pos + copy_len);
+ if (mod)
+ copy_len += PAGE_SIZE - mod;
+
+ invalidate_inode_pages2_range(iter->inode->i_mapping,
+ copy_pos >> PAGE_SHIFT,
+ (copy_pos + copy_len - 1) >> PAGE_SHIFT);
id = dax_read_lock();
- ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
+ ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
if (ret < 0)
goto out_unlock;
- /* zero the distance if srcmap is HOLE or UNWRITTEN */
- if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
- memset(daddr, 0, length);
- dax_flush(iomap->dax_dev, daddr, length);
- ret = length;
- goto out_unlock;
- }
-
- ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
+ ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, length) == 0)
- ret = length;
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
+ ret = iomap_length(iter);
else
ret = -EIO;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 320d586c3896..bed3dbe5b7cb 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -709,7 +709,9 @@ static int erofs_fc_get_tree(struct fs_context *fc)
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- ret = get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
if (ret == -ENOTBLK) {
if (!fc->source)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f33fbce86ae0..dafdf766b1d5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2288,6 +2288,13 @@ static int fuse_writepages_fill(struct folio *folio,
struct folio *tmp_folio;
int err;
+ if (!data->ff) {
+ err = -EIO;
+ data->ff = fuse_write_file_get(fi);
+ if (!data->ff)
+ goto out_unlock;
+ }
+
if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
@@ -2351,13 +2358,13 @@ static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
+ err = -EIO;
if (fuse_is_bad(inode))
- return -EIO;
+ goto out;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
@@ -2365,9 +2372,7 @@ static int fuse_writepages(struct address_space *mapping,
data.inode = inode;
data.wpa = NULL;
- data.ff = fuse_write_file_get(fi);
- if (!data.ff)
- return -EIO;
+ data.ff = NULL;
err = -ENOMEM;
data.orig_pages = kcalloc(fc->max_pages,
@@ -2381,10 +2386,11 @@ static int fuse_writepages(struct address_space *mapping,
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
}
+ if (data.ff)
+ fuse_file_put(data.ff, false);
kfree(data.orig_pages);
out:
- fuse_file_put(data.ff, false);
return err;
}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 62aee8289d11..bbac547dfcb3 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file)
fuse_invalidate_atime(inode);
}
-static void fuse_file_modified(struct file *file)
+static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret)
{
struct inode *inode = file_inode(file);
- fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+ fuse_write_update_attr(inode, pos, ret);
}
ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -63,7 +63,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
struct backing_file_ctx ctx = {
.cred = ff->cred,
.user_file = file,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
@@ -110,7 +110,7 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
struct backing_file_ctx ctx = {
.cred = ff->cred,
.user_file = out,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
@@ -234,7 +234,6 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
goto out;
backing_sb = file_inode(file)->i_sb;
- pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth);
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
goto out_fput;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index aa587b2142e2..ef0b68bccbb6 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1277,22 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* Don't bother with blocks that are not shared to start with. */
- if (!(iomap->flags & IOMAP_F_SHARED))
- return length;
-
- /*
- * Don't bother with delalloc reservations, holes or unwritten extents.
- *
- * Note that we use srcmap directly instead of iomap_iter_srcmap as
- * unsharing requires providing a separate source map, and the presence
- * of one is a good indicator that unsharing is needed, unlike
- * IOMAP_F_SHARED which can be set for any data that goes into the COW
- * fork for XFS.
- */
- if (iter->srcmap.type == IOMAP_HOLE ||
- iter->srcmap.type == IOMAP_DELALLOC ||
- iter->srcmap.type == IOMAP_UNWRITTEN)
+ if (!iomap_want_unshare_iter(iter))
return length;
do {
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 974ecf5e0d95..3ab410059dc2 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap)
}
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag || bmp->db_numag >= MAXAG) {
+ if (!bmp->db_numag || bmp->db_numag > MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 93c377816d75..d26f5e6d2ca3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3944,7 +3944,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- free_mnt_ns(new_ns);
+ ns_free_inum(&new_ns->ns);
+ dec_mnt_namespaces(new_ns->ucounts);
+ mnt_ns_release(new_ns);
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index c40e226053cc..af46a598f4d7 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -67,7 +67,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
* Decant the list of folios to read into a rolling buffer.
*/
static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
- struct folio_queue *folioq)
+ struct folio_queue *folioq,
+ struct folio_batch *put_batch)
{
unsigned int order, nr;
size_t size = 0;
@@ -82,6 +83,9 @@ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
order = folio_order(folio);
folioq->orders[i] = order;
size += PAGE_SIZE << order;
+
+ if (!folio_batch_add(put_batch, folio))
+ folio_batch_release(put_batch);
}
for (int i = nr; i < folioq_nr_slots(folioq); i++)
@@ -120,6 +124,9 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
* that we will need to release later - but we don't want to do
* that until after we've started the I/O.
*/
+ struct folio_batch put_batch;
+
+ folio_batch_init(&put_batch);
while (rreq->submitted < subreq->start + rsize) {
struct folio_queue *tail = rreq->buffer_tail, *new;
size_t added;
@@ -132,10 +139,11 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
new->prev = tail;
tail->next = new;
rreq->buffer_tail = new;
- added = netfs_load_buffer_from_ra(rreq, new);
+ added = netfs_load_buffer_from_ra(rreq, new, &put_batch);
rreq->iter.count += added;
rreq->submitted += added;
}
+ folio_batch_release(&put_batch);
}
subreq->len = rsize;
@@ -348,6 +356,7 @@ static int netfs_wait_for_read(struct netfs_io_request *rreq)
static int netfs_prime_buffer(struct netfs_io_request *rreq)
{
struct folio_queue *folioq;
+ struct folio_batch put_batch;
size_t added;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
@@ -360,39 +369,14 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq)
rreq->submitted = rreq->start;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
- added = netfs_load_buffer_from_ra(rreq, folioq);
+ folio_batch_init(&put_batch);
+ added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch);
+ folio_batch_release(&put_batch);
rreq->iter.count += added;
rreq->submitted += added;
return 0;
}
-/*
- * Drop the ref on each folio that we inherited from the VM readahead code. We
- * still have the folio locks to pin the page until we complete the I/O.
- *
- * Note that we can't just release the batch in each queue struct as we use the
- * occupancy count in other places.
- */
-static void netfs_put_ra_refs(struct folio_queue *folioq)
-{
- struct folio_batch fbatch;
-
- folio_batch_init(&fbatch);
- while (folioq) {
- for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) {
- struct folio *folio = folioq_folio(folioq, slot);
- if (!folio)
- continue;
- trace_netfs_folio(folio, netfs_folio_trace_read_put);
- if (!folio_batch_add(&fbatch, folio))
- folio_batch_release(&fbatch);
- }
- folioq = folioq->next;
- }
-
- folio_batch_release(&fbatch);
-}
-
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -436,9 +420,6 @@ void netfs_readahead(struct readahead_control *ractl)
goto cleanup_free;
netfs_read_to_pagecache(rreq);
- /* Release the folio refs whilst we're waiting for the I/O. */
- netfs_put_ra_refs(rreq->buffer);
-
netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return;
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
index 21eab56ee2f9..2249ecd09d0a 100644
--- a/fs/netfs/locking.c
+++ b/fs/netfs/locking.c
@@ -109,6 +109,7 @@ int netfs_start_io_write(struct inode *inode)
up_write(&inode->i_rwsem);
return -ERESTARTSYS;
}
+ downgrade_write(&inode->i_rwsem);
return 0;
}
EXPORT_SYMBOL(netfs_start_io_write);
@@ -123,7 +124,7 @@ EXPORT_SYMBOL(netfs_start_io_write);
void netfs_end_io_write(struct inode *inode)
__releases(inode->i_rwsem)
{
- up_write(&inode->i_rwsem);
+ up_read(&inode->i_rwsem);
}
EXPORT_SYMBOL(netfs_end_io_write);
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index b18c65ba5580..3cbb289535a8 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -77,6 +77,8 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
folio_unlock(folio);
}
}
+
+ folioq_clear(folioq, slot);
}
/*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b5a6bf4f459f..d32f2dfd148f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1841,14 +1841,12 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!async_copy)
goto out_err;
async_copy->cp_nn = nn;
+ INIT_LIST_HEAD(&async_copy->copies);
+ refcount_set(&async_copy->refcount, 1);
/* Arbitrary cap on number of pending async copy operations */
if (atomic_inc_return(&nn->pending_async_copies) >
- (int)rqstp->rq_pool->sp_nrthreads) {
- atomic_dec(&nn->pending_async_copies);
+ (int)rqstp->rq_pool->sp_nrthreads)
goto out_err;
- }
- INIT_LIST_HEAD(&async_copy->copies);
- refcount_set(&async_copy->refcount, 1);
async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
if (!async_copy->cp_src)
goto out_err;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 56b261608af4..551d2958ec29 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1359,21 +1359,47 @@ static void destroy_delegation(struct nfs4_delegation *dp)
destroy_unhashed_deleg(dp);
}
+/**
+ * revoke_delegation - perform nfs4 delegation structure cleanup
+ * @dp: pointer to the delegation
+ *
+ * This function assumes that it's called either from the administrative
+ * interface (nfsd4_revoke_states()) that's revoking a specific delegation
+ * stateid or it's called from a laundromat thread (nfsd4_landromat()) that
+ * determined that this specific state has expired and needs to be revoked
+ * (both mark state with the appropriate stid sc_status mode). It is also
+ * assumed that a reference was taken on the @dp state.
+ *
+ * If this function finds that the @dp state is SC_STATUS_FREED it means
+ * that a FREE_STATEID operation for this stateid has been processed and
+ * we can proceed to removing it from recalled list. However, if @dp state
+ * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked
+ * list and wait for the FREE_STATEID to arrive from the client. At the same
+ * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the
+ * nfsd4_free_stateid() function that this stateid has already been added
+ * to the cl_revoked list and that nfsd4_free_stateid() is now responsible
+ * for removing it from the list. Inspection of where the delegation state
+ * in the revocation process is protected by the clp->cl_lock.
+ */
static void revoke_delegation(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_stid.sc_client;
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ WARN_ON_ONCE(!(dp->dl_stid.sc_status &
+ (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)));
trace_nfsd_stid_revoke(&dp->dl_stid);
- if (dp->dl_stid.sc_status &
- (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
- spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
- list_add(&dp->dl_recall_lru, &clp->cl_revoked);
- spin_unlock(&clp->cl_lock);
+ spin_lock(&clp->cl_lock);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ goto out;
}
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
+out:
+ spin_unlock(&clp->cl_lock);
destroy_unhashed_deleg(dp);
}
@@ -1780,6 +1806,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
mutex_unlock(&stp->st_mutex);
break;
case SC_TYPE_DELEG:
+ refcount_inc(&stid->sc_count);
dp = delegstateid(stid);
spin_lock(&state_lock);
if (!unhash_delegation_locked(
@@ -6545,6 +6572,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(&lt, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7157,7 +7185,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
s->sc_status |= SC_STATUS_CLOSED;
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7487,7 +7517,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG,
+ SC_STATUS_REVOKED | SC_STATUS_FREEABLE,
+ &s, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -8684,7 +8716,7 @@ nfs4_state_shutdown_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
shrinker_free(nn->nfsd_client_shrinker);
- cancel_work(&nn->nfsd_shrinker_work);
+ cancel_work_sync(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 79c743c01a47..35b3564c065f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 4905063790c5..9b108052d9f7 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -157,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_constraint(inode->i_mapping,
+ ~__GFP_FS));
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 9c0b7cddeaae..10def4b55995 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -77,7 +77,8 @@ void nilfs_forget_buffer(struct buffer_head *bh)
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
- BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) |
+ BIT(BH_Delay));
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
@@ -400,13 +401,15 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
- BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) |
+ BIT(BH_Delay));
bh = head;
do {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ad131a2fc58e..06af21982c16 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1129,9 +1129,12 @@ int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_ocfs2_setattr(inode, dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
dentry->d_name.len, dentry->d_name.name,
- attr->ia_valid, attr->ia_mode,
- from_kuid(&init_user_ns, attr->ia_uid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ attr->ia_valid,
+ attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0,
+ attr->ia_valid & ATTR_UID ?
+ from_kuid(&init_user_ns, attr->ia_uid) : 0,
+ attr->ia_valid & ATTR_GID ?
+ from_kgid(&init_user_ns, attr->ia_gid) : 0);
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
@@ -1784,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
+
+ if (byte_start > id_count || byte_start + byte_len > id_count) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {
diff --git a/fs/open.c b/fs/open.c
index acaeb3e25c88..5da4df2f9b18 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1457,6 +1457,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
if (unlikely(usize < OPEN_HOW_SIZE_VER0))
return -EINVAL;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
if (err)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4504493b20be..4444c78e2e0c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -231,6 +231,11 @@ static void ovl_file_modified(struct file *file)
ovl_copyattr(file_inode(file));
}
+static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret)
+{
+ ovl_file_modified(file);
+}
+
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
@@ -294,7 +299,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
.user_file = file,
- .end_write = ovl_file_modified,
+ .end_write = ovl_file_end_write,
};
if (!iov_iter_count(iter))
@@ -364,7 +369,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
.user_file = out,
- .end_write = ovl_file_modified,
+ .end_write = ovl_file_end_write,
};
inode_lock(inode);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1f54a54bfb91..5e391cbca7a3 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -77,7 +77,7 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
return single_open(file, seq_show, inode);
}
-/**
+/*
* Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
* that the current task has PTRACE_MODE_READ in addition to the normal
* POSIX-like checks.
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 000e1ef3beea..20cafdff5081 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1780,7 +1780,7 @@ static int cifs_init_netfs(void)
nomem_subreqpool:
kmem_cache_destroy(cifs_io_subrequest_cachep);
nomem_subreq:
- mempool_destroy(&cifs_io_request_pool);
+ mempool_exit(&cifs_io_request_pool);
nomem_reqpool:
kmem_cache_destroy(cifs_io_request_cachep);
nomem_req:
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 28c4e576d460..5c5a52019efa 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,8 +920,15 @@ static int smb3_reconfigure(struct fs_context *fc)
else {
kfree_sensitive(ses->password);
ses->password = kstrdup(ctx->password, GFP_KERNEL);
+ if (!ses->password)
+ return -ENOMEM;
kfree_sensitive(ses->password2);
ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+ if (!ses->password2) {
+ kfree_sensitive(ses->password);
+ ses->password = NULL;
+ return -ENOMEM;
+ }
}
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 22251743fadf..d19d4db74af8 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
loff_t start_index = folio->index & ~mask;
loff_t end_index = start_index | mask;
- int i, n, pages, bytes, res = -ENOMEM;
+ loff_t index;
+ int i, pages, bytes, res = -ENOMEM;
struct page **page, *last_page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
return res;
/* Try to grab all the pages covered by the Squashfs block */
- for (i = 0, n = start_index; n <= end_index; n++) {
- page[i] = (n == folio->index) ? target_page :
- grab_cache_page_nowait(target_page->mapping, n);
+ for (i = 0, index = start_index; index <= end_index; index++) {
+ page[i] = (index == folio->index) ? target_page :
+ grab_cache_page_nowait(target_page->mapping, index);
if (page[i] == NULL)
continue;
diff --git a/fs/super.c b/fs/super.c
index 1db230432960..c9c7223bc2a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1596,13 +1596,14 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
EXPORT_SYMBOL_GPL(setup_bdev_super);
/**
- * get_tree_bdev - Get a superblock based on a single block device
+ * get_tree_bdev_flags - Get a superblock based on a single block device
* @fc: The filesystem context holding the parameters
* @fill_super: Helper to initialise a new superblock
+ * @flags: GET_TREE_BDEV_* flags
*/
-int get_tree_bdev(struct fs_context *fc,
- int (*fill_super)(struct super_block *,
- struct fs_context *))
+int get_tree_bdev_flags(struct fs_context *fc,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc), unsigned int flags)
{
struct super_block *s;
int error = 0;
@@ -1613,10 +1614,10 @@ int get_tree_bdev(struct fs_context *fc,
error = lookup_bdev(fc->source, &dev);
if (error) {
- errorf(fc, "%s: Can't lookup blockdev", fc->source);
+ if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP))
+ errorf(fc, "%s: Can't lookup blockdev", fc->source);
return error;
}
-
fc->sb_flags |= SB_NOSEC;
s = sget_dev(fc, dev);
if (IS_ERR(s))
@@ -1644,6 +1645,19 @@ int get_tree_bdev(struct fs_context *fc,
fc->root = dget(s->s_root);
return 0;
}
+EXPORT_SYMBOL_GPL(get_tree_bdev_flags);
+
+/**
+ * get_tree_bdev - Get a superblock based on a single block device
+ * @fc: The filesystem context holding the parameters
+ * @fill_super: Helper to initialise a new superblock
+ */
+int get_tree_bdev(struct fs_context *fc,
+ int (*fill_super)(struct super_block *,
+ struct fs_context *))
+{
+ return get_tree_bdev_flags(fc, fill_super, 0);
+}
EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 68cdd89c97a3..7c0bd0b55f88 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+ struct userfaultfd_fork_ctx *fctx, *n;
+
+ /*
+ * An error has occurred on fork, we will tear memory down, but have
+ * allocated memory for fctx's and raised reference counts for both the
+ * original and child contexts (and on the mm for each as a result).
+ *
+ * These would ordinarily be taken care of by a user handling the event,
+ * but we are no longer doing so, so manually clean up here.
+ *
+ * mm tear down will take care of cleaning up VMA contexts.
+ */
+ list_for_each_entry_safe(fctx, n, fcs, list) {
+ struct userfaultfd_ctx *octx = fctx->orig;
+ struct userfaultfd_ctx *ctx = fctx->new;
+
+ atomic_dec(&octx->mmap_changing);
+ VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ userfaultfd_ctx_put(octx);
+ userfaultfd_ctx_put(ctx);
+
+ list_del(&fctx->list);
+ kfree(fctx);
+ }
+}
+
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5f0494702e0b..5ca8d0106827 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -185,17 +185,20 @@ out:
}
/*
- * Free up the per-ag resources associated with the mount structure.
+ * Free up the per-ag resources within the specified AG range.
*/
void
-xfs_free_perag(
- struct xfs_mount *mp)
+xfs_free_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first_agno,
+ xfs_agnumber_t end_agno)
+
{
- struct xfs_perag *pag;
xfs_agnumber_t agno;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xa_erase(&mp->m_perags, agno);
+ for (agno = first_agno; agno < end_agno; agno++) {
+ struct xfs_perag *pag = xa_erase(&mp->m_perags, agno);
+
ASSERT(pag);
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
xfs_defer_drain_free(&pag->pag_intents_drain);
@@ -270,54 +273,37 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
-/*
- * Free perag within the specified AG range, it is only used to free unused
- * perags under the error handling path.
- */
-void
-xfs_free_unused_perag_range(
+int
+xfs_update_last_ag_size(
struct xfs_mount *mp,
- xfs_agnumber_t agstart,
- xfs_agnumber_t agend)
+ xfs_agnumber_t prev_agcount)
{
- struct xfs_perag *pag;
- xfs_agnumber_t index;
+ struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1);
- for (index = agstart; index < agend; index++) {
- pag = xa_erase(&mp->m_perags, index);
- if (!pag)
- break;
- xfs_buf_cache_destroy(&pag->pag_bcache);
- xfs_defer_drain_free(&pag->pag_intents_drain);
- kfree(pag);
- }
+ if (!pag)
+ return -EFSCORRUPTED;
+ pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1,
+ mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ xfs_perag_rele(pag);
+ return 0;
}
int
xfs_initialize_perag(
struct xfs_mount *mp,
- xfs_agnumber_t agcount,
+ xfs_agnumber_t old_agcount,
+ xfs_agnumber_t new_agcount,
xfs_rfsblock_t dblocks,
xfs_agnumber_t *maxagi)
{
struct xfs_perag *pag;
xfs_agnumber_t index;
- xfs_agnumber_t first_initialised = NULLAGNUMBER;
int error;
- /*
- * Walk the current per-ag tree so we don't try to initialise AGs
- * that already exist (growfs case). Allocate and insert all the
- * AGs we don't find ready for initialisation.
- */
- for (index = 0; index < agcount; index++) {
- pag = xfs_perag_get(mp, index);
- if (pag) {
- xfs_perag_put(pag);
- continue;
- }
-
- pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ for (index = old_agcount; index < new_agcount; index++) {
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL);
if (!pag) {
error = -ENOMEM;
goto out_unwind_new_pags;
@@ -353,21 +339,17 @@ xfs_initialize_perag(
/* Active ref owned by mount indicates AG is online. */
atomic_set(&pag->pag_active_ref, 1);
- /* first new pag is fully initialized */
- if (first_initialised == NULLAGNUMBER)
- first_initialised = index;
-
/*
* Pre-calculated geometry
*/
- pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ pag->block_count = __xfs_ag_block_count(mp, index, new_agcount,
dblocks);
pag->min_block = XFS_AGFL_BLOCK(mp);
__xfs_agino_range(mp, pag->block_count, &pag->agino_min,
&pag->agino_max);
}
- index = xfs_set_inode_alloc(mp, agcount);
+ index = xfs_set_inode_alloc(mp, new_agcount);
if (maxagi)
*maxagi = index;
@@ -381,8 +363,7 @@ out_remove_pag:
out_free_pag:
kfree(pag);
out_unwind_new_pags:
- /* unwind any prior newly initialized pags */
- xfs_free_unused_perag_range(mp, first_initialised, agcount);
+ xfs_free_perag_range(mp, old_agcount, index);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index d9cccd093b60..9edfe0e96439 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -144,12 +144,13 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
-void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
- xfs_agnumber_t agend);
-int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
- xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount,
+ xfs_agnumber_t agcount, xfs_rfsblock_t dcount,
+ xfs_agnumber_t *maxagi);
+void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno,
+ xfs_agnumber_t end_agno);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
-void xfs_free_perag(struct xfs_mount *mp);
+int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount);
/* Passive AG references */
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 04f64cf9777e..22bdbb3e9980 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1923,7 +1923,7 @@ restart:
error = -EFSCORRUPTED;
goto error0;
}
- if (flen < bestrlen)
+ if (flen <= bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 67478294f11a..155bbaaa496e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1084,9 +1084,11 @@ xrep_metadata_inode_forks(
return error;
/* Make sure the attr fork looks ok before we delete it. */
- error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
- if (error)
- return error;
+ if (xfs_inode_hasattr(sc->ip)) {
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+ }
/* Clear the reflink flag since metadata never shares. */
if (xfs_is_reflink_inode(sc->ip)) {
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 09e893cf563c..5180cbf5a90b 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -22,6 +22,9 @@
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_quota.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
/*
* This is the number of entries in the l_buf_cancel_table used during
@@ -685,6 +688,67 @@ xlog_recover_do_inode_buffer(
}
/*
+ * Update the in-memory superblock and perag structures from the primary SB
+ * buffer.
+ *
+ * This is required because transactions running after growfs may require the
+ * updated values to be set in a previous fully commit transaction.
+ */
+static int
+xlog_recover_do_primary_sb_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+ xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount;
+ int error;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+
+ if (orig_agcount == 0) {
+ xfs_alert(mp, "Trying to grow file system without AGs");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Update the in-core super block from the freshly recovered on-disk one.
+ */
+ xfs_sb_from_disk(&mp->m_sb, dsb);
+
+ if (mp->m_sb.sb_agcount < orig_agcount) {
+ xfs_alert(mp, "Shrinking AG count in log recovery not supported");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Growfs can also grow the last existing AG. In this case we also need
+ * to update the length in the in-core perag structure and values
+ * depending on it.
+ */
+ error = xfs_update_last_ag_size(mp, orig_agcount);
+ if (error)
+ return error;
+
+ /*
+ * Initialize the new perags, and also update various block and inode
+ * allocator setting based off the number of AGs or total blocks.
+ * Because of the latter this also needs to happen if the agcount did
+ * not change.
+ */
+ error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks, &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed recovery per-ag init: %d", error);
+ return error;
+ }
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ return 0;
+}
+
+/*
* V5 filesystems know the age of the buffer on disk being recovered. We can
* have newer objects on disk than we are replaying, and so for these cases we
* don't want to replay the current change as that will make the buffer contents
@@ -967,6 +1031,12 @@ xlog_recover_buf_commit_pass2(
dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
if (!dirty)
goto out_release;
+ } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
+ xfs_buf_daddr(bp) == 0) {
+ error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
+ current_lsn);
+ if (error)
+ goto out_release;
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e3aaa0555597..290ba8887d29 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -64,25 +64,31 @@ xfs_filestream_pick_ag(
struct xfs_perag *pag;
struct xfs_perag *max_pag = NULL;
xfs_extlen_t minlen = *longest;
- xfs_extlen_t free = 0, minfree, maxfree = 0;
+ xfs_extlen_t minfree, maxfree = 0;
xfs_agnumber_t agno;
bool first_pass = true;
- int err;
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
restart:
for_each_perag_wrap(mp, start_agno, agno, pag) {
+ int err;
+
trace_xfs_filestream_scan(pag, pino);
+
*longest = 0;
err = xfs_bmap_longest_free_extent(pag, NULL, longest);
if (err) {
- if (err != -EAGAIN)
- break;
- /* Couldn't lock the AGF, skip this AG. */
- err = 0;
- continue;
+ if (err == -EAGAIN) {
+ /* Couldn't lock the AGF, skip this AG. */
+ err = 0;
+ continue;
+ }
+ xfs_perag_rele(pag);
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ return err;
}
/* Keep track of the AG with the most free blocks. */
@@ -107,8 +113,9 @@ restart:
!(flags & XFS_PICK_USERDATA) ||
(flags & XFS_PICK_LOWSPACE))) {
/* Break out, retaining the reference on the AG. */
- free = pag->pagf_freeblks;
- break;
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ goto done;
}
}
@@ -116,57 +123,47 @@ restart:
atomic_dec(&pag->pagf_fstrms);
}
- if (err) {
- xfs_perag_rele(pag);
- if (max_pag)
- xfs_perag_rele(max_pag);
- return err;
+ /*
+ * Allow a second pass to give xfs_bmap_longest_free_extent() another
+ * attempt at locking AGFs that it might have skipped over before we
+ * fail.
+ */
+ if (first_pass) {
+ first_pass = false;
+ goto restart;
}
- if (!pag) {
- /*
- * Allow a second pass to give xfs_bmap_longest_free_extent()
- * another attempt at locking AGFs that it might have skipped
- * over before we fail.
- */
- if (first_pass) {
- first_pass = false;
- goto restart;
- }
+ /*
+ * We must be low on data space, so run a final lowspace optimised
+ * selection pass if we haven't already.
+ */
+ if (!(flags & XFS_PICK_LOWSPACE)) {
+ flags |= XFS_PICK_LOWSPACE;
+ goto restart;
+ }
- /*
- * We must be low on data space, so run a final lowspace
- * optimised selection pass if we haven't already.
- */
- if (!(flags & XFS_PICK_LOWSPACE)) {
- flags |= XFS_PICK_LOWSPACE;
- goto restart;
+ /*
+ * No unassociated AGs are available, so select the AG with the most
+ * free space, regardless of whether it's already in use by another
+ * filestream. It none suit, just use whatever AG we can grab.
+ */
+ if (!max_pag) {
+ for_each_perag_wrap(args->mp, 0, start_agno, pag) {
+ max_pag = pag;
+ break;
}
- /*
- * No unassociated AGs are available, so select the AG with the
- * most free space, regardless of whether it's already in use by
- * another filestream. It none suit, just use whatever AG we can
- * grab.
- */
- if (!max_pag) {
- for_each_perag_wrap(args->mp, 0, start_agno, args->pag)
- break;
- atomic_inc(&args->pag->pagf_fstrms);
- *longest = 0;
- } else {
- pag = max_pag;
- free = maxfree;
- atomic_inc(&pag->pagf_fstrms);
- }
- } else if (max_pag) {
- xfs_perag_rele(max_pag);
+ /* Bail if there are no AGs at all to select from. */
+ if (!max_pag)
+ return -ENOSPC;
}
- trace_xfs_filestream_pick(pag, pino, free);
+ pag = max_pag;
+ atomic_inc(&pag->pagf_fstrms);
+done:
+ trace_xfs_filestream_pick(pag, pino);
args->pag = pag;
return 0;
-
}
static struct xfs_inode *
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3643cc843f62..b247d895c276 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -87,6 +87,7 @@ xfs_growfs_data_private(
struct xfs_mount *mp, /* mount point for filesystem */
struct xfs_growfs_data *in) /* growfs data input struct */
{
+ xfs_agnumber_t oagcount = mp->m_sb.sb_agcount;
struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
@@ -94,7 +95,6 @@ xfs_growfs_data_private(
xfs_rfsblock_t nb, nb_div, nb_mod;
int64_t delta;
bool lastag_extended = false;
- xfs_agnumber_t oagcount;
struct xfs_trans *tp;
struct aghdr_init_data id = {};
struct xfs_perag *last_pag;
@@ -138,16 +138,14 @@ xfs_growfs_data_private(
if (delta == 0)
return 0;
- oagcount = mp->m_sb.sb_agcount;
- /* allocate the new per-ag structures */
- if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
- if (error)
- return error;
- } else if (nagcount < oagcount) {
- /* TODO: shrinking the entire AGs hasn't yet completed */
+ /* TODO: shrinking the entire AGs hasn't yet completed */
+ if (nagcount < oagcount)
return -EINVAL;
- }
+
+ /* allocate the new per-ag structures */
+ error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax);
+ if (error)
+ return error;
if (delta > 0)
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
@@ -231,7 +229,7 @@ out_trans_cancel:
xfs_trans_cancel(tp);
out_free_unused_perag:
if (nagcount > oagcount)
- xfs_free_unused_perag_range(mp, oagcount, nagcount);
+ xfs_free_perag_range(mp, oagcount, nagcount);
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bcc277fc0a83..19dcb569a3e7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1409,7 +1409,7 @@ xfs_inactive(
if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
+ xfs_inode_has_filedata(ip)))
truncate = 1;
if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97ed912306fd..03944b6c5fba 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -292,6 +292,11 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
+static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip)
+{
+ return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0;
+}
+
/*
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a20d426ef021..2567fd2a0994 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -481,7 +481,7 @@ xfs_ioctl_setattr_xflags(
if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
/* Can't change realtime flag if any extents are allocated. */
- if (ip->i_df.if_nextents || ip->i_delayed_blks)
+ if (xfs_inode_has_filedata(ip))
return -EINVAL;
/*
@@ -602,7 +602,7 @@ xfs_ioctl_setattr_check_extsize(
if (!fa->fsx_valid)
return 0;
- if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
+ if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) &&
XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
return -EINVAL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 916531d9f83c..86da16f54be9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -707,7 +707,7 @@ imap_needs_cow(
return false;
/* when zeroing we don't have to COW holes or unwritten extents */
- if (flags & IOMAP_ZERO) {
+ if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
if (!nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
imap->br_state == XFS_EXT_UNWRITTEN)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a13bf53fea49..704aaadb61cf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3393,13 +3393,6 @@ xlog_do_recover(
/* re-initialise in-core superblock and geometry structures */
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
- error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
- &mp->m_maxagi);
- if (error) {
- xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
- return error;
- }
- mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
/* Normal transactions can now occur */
clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 1fdd79c5bfa0..25bbcc3f4ee0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -810,8 +810,8 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
- &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, 0, sbp->sb_agcount,
+ mp->m_sb.sb_dblocks, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
@@ -1048,7 +1048,7 @@ xfs_mountfs(
xfs_buftarg_drain(mp->m_logdev_targp);
xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
- xfs_free_perag(mp);
+ xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
out_free_dir:
xfs_da_unmount(mp);
out_remove_uuid:
@@ -1129,8 +1129,7 @@ xfs_unmountfs(
xfs_errortag_clearall(mp);
#endif
shrinker_free(mp->m_inodegc_shrinker);
- xfs_free_perag(mp);
-
+ xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ee9f0b1f548d..fcb2bad4f76e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -691,8 +691,8 @@ DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
TRACE_EVENT(xfs_filestream_pick,
- TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free),
- TP_ARGS(pag, ino, free),
+ TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino),
+ TP_ARGS(pag, ino),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -703,14 +703,9 @@ TRACE_EVENT(xfs_filestream_pick,
TP_fast_assign(
__entry->dev = pag->pag_mount->m_super->s_dev;
__entry->ino = ino;
- if (pag) {
- __entry->agno = pag->pag_agno;
- __entry->streams = atomic_read(&pag->pagf_fstrms);
- } else {
- __entry->agno = NULLAGNUMBER;
- __entry->streams = 0;
- }
- __entry->free = free;
+ __entry->agno = pag->pag_agno;
+ __entry->streams = atomic_read(&pag->pagf_fstrms);
+ __entry->free = pag->pagf_freeblks;
),
TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d",
MAJOR(__entry->dev), MINOR(__entry->dev),